Recursive sub folder search and return files in a list python

Recursive sub folder search and return files in a list python


You should be using the dirpath which you call root. The dirnames are supplied so you can prune it if there are folders that you don't wish os.walk to recurse into.

import os

# Walk PATH recursively and keep every file whose extension is exactly '.txt'.
# os.walk yields (dirpath, dirnames, filenames) for each directory it visits;
# the file names are bare names, so they are joined with dirpath to get a
# usable path.
result = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(PATH)
    for filename in filenames
    if os.path.splitext(filename)[1] == '.txt'
]

Edit:

After the latest downvote, it occurred to me that glob is a better tool for selecting by extension.

import os
from glob import glob

# os.walk supplies every directory at or below PATH; glob then selects the
# '*.txt' entries inside each of those directories.
result = [
    match
    for dirpath, _dirnames, _filenames in os.walk(PATH)
    for match in glob(os.path.join(dirpath, '*.txt'))
]

Also a generator version

import os
from glob import glob
from itertools import chain

# Lazy variant: `result` is an iterator, and a directory is only globbed when
# the iteration reaches it. Paths are relative to the current directory ('.').
result = chain.from_iterable(
    glob(os.path.join(dirpath, '*.txt'))
    for dirpath, _dirnames, _filenames in os.walk('.')
)

Edit2 for Python 3.4+

from pathlib import Path

# rglob searches the current directory and everything below it. The character
# classes make the extension match case-insensitive ('.txt', '.TXT', '.Txt', ...).
result = list(Path(".").rglob("*.[tT][xX][tT]"))


Changed in Python 3.5: Support for recursive globs using “**”.

glob.glob() got a new recursive parameter.

If you want to get every .txt file under my_path (recursively including subdirs):

import glob
import os

# my_path   the directory to start from
# **        any files and zero or more directories below my_path
#           (only treated this way because recursive=True)
# *.txt     every entry whose name ends with '.txt'
#
# os.path.join is used instead of string concatenation so that an empty
# my_path or one with a trailing separator still produces a sane pattern.
files = glob.glob(os.path.join(my_path, '**', '*.txt'), recursive=True)

If you need an iterator you can use iglob as an alternative:

import glob
import os

# iglob yields the same matches as glob.glob, but one at a time instead of
# building the whole list first. The pattern and recursive=True must be the
# same as for glob.glob; passing only my_path would just yield my_path itself.
for file in glob.iglob(os.path.join(my_path, '**', '*.txt'), recursive=True):
    ...  # process `file` here


This seems to be the fastest solution I could come up with, and is faster than os.walk and a lot faster than any glob solution.

  • It will also give you a list of all nested subfolders at basically no cost.
  • You can search for several different extensions.
  • You can also choose to return either full paths or just the names for the files by changing f.path to f.name (do not change it for subfolders!).

Args: dir: str, ext: list.
Function returns two lists: subfolders, files.

See below for a detailed speed analysis.

import os


def _scan_tree(path, wanted):
    """Recursively scan *path*; *wanted* is a set of lowercase extensions."""
    subfolders, files = [], []
    # The context manager closes the scandir iterator even if an error occurs
    # while the entries are being inspected.
    with os.scandir(path) as entries:
        for entry in entries:
            # NOTE: is_dir()/is_file() follow symbolic links by default, so a
            # symlink that points back up the tree would recurse without end.
            if entry.is_dir():
                subfolders.append(entry.path)
            elif entry.is_file():
                if os.path.splitext(entry.name)[1].lower() in wanted:
                    files.append(entry.path)
    # Iterate over a copy: `subfolders` grows while the nested results are added.
    for subfolder in list(subfolders):
        nested_subfolders, nested_files = _scan_tree(subfolder, wanted)
        subfolders.extend(nested_subfolders)
        files.extend(nested_files)
    return subfolders, files


def run_fast_scandir(dir, ext):    # dir: str, ext: list
    """Collect all subfolders of *dir* and all files with a wanted extension.

    Args:
        dir: Path of the folder to scan (str).
        ext: Extensions to keep, each including the leading dot, e.g.
            [".jpg", ".png"]. Matching is case-insensitive. A single string
            is accepted as well and treated as one extension.

    Returns:
        A tuple (subfolders, files) of two lists with the full paths of every
        nested subfolder and of every matching file. To get bare file names
        instead, append entry.name rather than entry.path for files only.
    """
    # Normalise once so callers may pass ".JPG" as well as ".jpg".
    if isinstance(ext, str):
        wanted = {ext.lower()}
    else:
        wanted = {e.lower() for e in ext}
    return _scan_tree(dir, wanted)


subfolders, files = run_fast_scandir(folder, [".jpg"])

In case you need the file size, you can also create a sizes list and add f.stat().st_size like this for a display of MiB:

# Convert the entry's size from bytes to MiB (1 MiB = 1024 * 1024 bytes) and
# store it as text rounded to a whole number.
size_in_mib = f.stat().st_size / 1024 / 1024
sizes.append(f"{size_in_mib:.0f} MiB")

Speed analysis

for various methods to get all files with a specific file extension inside all subfolders and the main folder.

tl;dr:

  • fast_scandir clearly wins and is twice as fast as all other solutions, except os.walk.
  • os.walk takes second place and is only slightly slower.
  • using glob will greatly slow down the process.
  • None of the results use natural sorting. This means results will be sorted like this: 1, 10, 2. To get natural sorting (1, 2, 10), please have a look at https://stackoverflow.com/a/48030307/2441026

**Results:**
fast_scandir    took  499 ms. Found files: 16596. Found subfolders: 439
os.walk         took  589 ms. Found files: 16596
find_files      took  919 ms. Found files: 16596
glob.iglob      took  998 ms. Found files: 16596
glob.glob       took 1002 ms. Found files: 16596
pathlib.rglob   took 1041 ms. Found files: 16596
os.walk-glob    took 1043 ms. Found files: 16596

Tests were done with W7x64, Python 3.8.1, 20 runs. 16596 files in 439 (partially nested) subfolders.
find_files is from https://stackoverflow.com/a/45646357/2441026 and lets you search for several extensions.
fast_scandir was written by myself and will also return a list of subfolders. You can give it a list of extensions to search for (I compared a one-entry list against a simple if ... == ".jpg" check and there was no significant difference).


# -*- coding: utf-8 -*-
# Python 3
"""Benchmark several ways of collecting all '.jpg' files below a folder.

Each run_* function repeats its search RUNS times on `directory` and prints
the average time per run together with the number of files found.
"""
import os
import time
from glob import glob, iglob
from pathlib import Path

directory = r"<folder>"
RUNS = 20


def _ms_per_run(started):
    """Return the average milliseconds per run elapsed since *started*.

    *started* must come from time.perf_counter_ns(), a monotonic clock meant
    for measuring short durations; the wall clock can jump during a benchmark.
    """
    return (time.perf_counter_ns() - started) / 1000 / 1000 / RUNS


def run_os_walk():
    """Time os.walk with a manual, case-insensitive extension filter."""
    a = time.perf_counter_ns()
    for _ in range(RUNS):
        fu = [
            os.path.join(dp, f)
            for dp, dn, filenames in os.walk(directory)
            for f in filenames
            if os.path.splitext(f)[1].lower() == '.jpg'
        ]
    print(f"os.walk\t\t\ttook {_ms_per_run(a):.0f} ms. Found files: {len(fu)}")


def run_os_walk_glob():
    """Time os.walk for the directories combined with glob per directory."""
    a = time.perf_counter_ns()
    for _ in range(RUNS):
        fu = [y for x in os.walk(directory) for y in glob(os.path.join(x[0], '*.jpg'))]
    print(f"os.walk-glob\ttook {_ms_per_run(a):.0f} ms. Found files: {len(fu)}")


def run_glob():
    """Time glob.glob with a recursive '**' pattern."""
    a = time.perf_counter_ns()
    for _ in range(RUNS):
        fu = glob(os.path.join(directory, '**', '*.jpg'), recursive=True)
    print(f"glob.glob\t\ttook {_ms_per_run(a):.0f} ms. Found files: {len(fu)}")


def run_iglob():
    """Time glob.iglob with a recursive '**' pattern, consumed into a list."""
    a = time.perf_counter_ns()
    for _ in range(RUNS):
        fu = list(iglob(os.path.join(directory, '**', '*.jpg'), recursive=True))
    print(f"glob.iglob\t\ttook {_ms_per_run(a):.0f} ms. Found files: {len(fu)}")


def run_pathlib_rglob():
    """Time pathlib.Path.rglob, consumed into a list."""
    a = time.perf_counter_ns()
    for _ in range(RUNS):
        fu = list(Path(directory).rglob("*.jpg"))
    print(f"pathlib.rglob\ttook {_ms_per_run(a):.0f} ms. Found files: {len(fu)}")


def find_files(files, dirs=(), extensions=()):
    """Append to *files* every path below *dirs* whose extension matches.

    Args:
        files: List that receives the matching paths (modified in place).
        dirs: Paths to start from.
        extensions: Lowercase extensions including the dot, e.g. [".jpg"].

    The tree is processed one depth level at a time. Any path on which
    os.listdir raises OSError is treated as a file.
    """
    # https://stackoverflow.com/a/45646357/2441026
    while dirs:
        new_dirs = []
        for d in dirs:
            try:
                new_dirs += [os.path.join(d, f) for f in os.listdir(d)]
            except OSError:
                if os.path.splitext(d)[1].lower() in extensions:
                    files.append(d)
        dirs = new_dirs


def _scan_tree(path, wanted):
    """Recursively scan *path*; *wanted* is a set of lowercase extensions."""
    subfolders, files = [], []
    # The context manager closes the scandir iterator even if an error occurs
    # while the entries are being inspected.
    with os.scandir(path) as entries:
        for entry in entries:
            # NOTE: is_dir()/is_file() follow symbolic links by default, so a
            # symlink that points back up the tree would recurse without end.
            if entry.is_dir():
                subfolders.append(entry.path)
            elif entry.is_file():
                if os.path.splitext(entry.name)[1].lower() in wanted:
                    files.append(entry.path)
    # Iterate over a copy: `subfolders` grows while the nested results are added.
    for subfolder in list(subfolders):
        nested_subfolders, nested_files = _scan_tree(subfolder, wanted)
        subfolders.extend(nested_subfolders)
        files.extend(nested_files)
    return subfolders, files


def run_fast_scandir(dir, ext):    # dir: str, ext: list
    """Collect all subfolders of *dir* and all files with a wanted extension.

    Args:
        dir: Path of the folder to scan (str).
        ext: Extensions to keep, each including the leading dot. Matching is
            case-insensitive. A single string is treated as one extension.

    Returns:
        A tuple (subfolders, files) of two lists of full paths.
    """
    # https://stackoverflow.com/a/59803793/2441026
    # Normalise once so callers may pass ".JPG" as well as ".jpg".
    if isinstance(ext, str):
        wanted = {ext.lower()}
    else:
        wanted = {e.lower() for e in ext}
    return _scan_tree(dir, wanted)


if __name__ == '__main__':
    run_os_walk()
    run_os_walk_glob()
    run_glob()
    run_iglob()
    run_pathlib_rglob()

    a = time.perf_counter_ns()
    for _ in range(RUNS):
        files = []
        find_files(files, dirs=[directory], extensions=[".jpg"])
    print(f"find_files\t\ttook {_ms_per_run(a):.0f} ms. Found files: {len(files)}")

    a = time.perf_counter_ns()
    for _ in range(RUNS):
        subf, files = run_fast_scandir(directory, [".jpg"])
    print(f"fast_scandir\ttook {_ms_per_run(a):.0f} ms. Found files: {len(files)}. Found subfolders: {len(subf)}")