LinuxのCでディレクトリを再帰的にリストする方法は？

Question

Cプログラミングですべてのディレクトリとファイルを再帰的にリストする必要があります。 FTWを調べましたが、使用している2つのオペレーティングシステム（FedoraとMinix）には含まれていません。私は過去数時間にわたって読んださまざまなものから大きな頭痛を感じ始めています。

どなたか参考になるコードスニペットをご存じでしたら、見せていただけると大変助かります。あるいは、良い方向性を示していただけるだけでも、とてもありがたいです。

Lloyd Macrohon · Accepted Answer

再帰バージョンは次のとおりです。

#include <unistd.h> #include <sys/types.h> #include <dirent.h> #include <stdio.h> #include <string.h> void listdir(const char *name, int indent) { DIR *dir; struct dirent *entry; if (!(dir = opendir(name))) return; while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_DIR) { char path[1024]; if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) continue; snprintf(path, sizeof(path), "%s/%s", name, entry->d_name); printf("%*s[%s]
", indent, "", entry->d_name); listdir(path, indent + 2); } else { printf("%*s- %s
", indent, "", entry->d_name); } } closedir(dir); } int main(void) { listdir(".", 0); return 0; }

Nominal Animal · Answer

なぜ誰もが何度も車輪の再発明を主張するのですか？

POSIX.1-2008は nftw() 関数を標準化しています。この関数はSingle Unix Specification v4（SUSv4）でも定義されており、Linux（glibc、 man 3 nftw ）、OS X、および最新のBSDバリアントで利用できます。まったく新しいものではありません。

ナイーブopendir()/readdir()/closedir()ベースの実装では、ツリー走査中にディレクトリまたはファイルが移動、名前変更、または削除される場合はほとんど処理されませんが、nftw()はそれらを適切に処理する必要があります。

例として、現在の作業ディレクトリ、コマンドラインで指定された各ディレクトリ、またはコマンドラインで指定されたファイルのみで始まるディレクトリツリーをリストする次のCプログラムを考えます。

/* We want POSIX.1-2008 + XSI, i.e. SuSv4, features */ #define _XOPEN_SOURCE 700 /* Added on 2017-06-25: If the C library can support 64-bit file sizes and offsets, using the standard names, these defines tell the C library to do so. */ #define _LARGEFILE64_SOURCE #define _FILE_OFFSET_BITS 64 #include <stdlib.h> #include <unistd.h> #include <ftw.h> #include <time.h> #include <stdio.h> #include <string.h> #include <errno.h> /* POSIX.1 says each process has at least 20 file descriptors. * Three of those belong to the standard streams. * Here, we use a conservative estimate of 15 available; * assuming we use at most two for other uses in this program, * we should never run into any problems. * Most trees are shallower than that, so it is efficient. * Deeper trees are traversed fine, just a bit slower. * (Linux allows typically hundreds to thousands of open files, * so you'll probably never see any issues even if you used * a much higher value, say a couple of hundred, but * 15 is a safe, reasonable value.) */ #ifndef USE_FDS #define USE_FDS 15 #endif int print_entry(const char *filepath, const struct stat *info, const int typeflag, struct FTW *pathinfo) { /* const char *const filename = filepath + pathinfo->base; */ const double bytes = (double)info->st_size; /* Not exact if large! 
*/ struct tm mtime; localtime_r(&(info->st_mtime), &mtime); printf("%04d-%02d-%02d %02d:%02d:%02d", mtime.tm_year+1900, mtime.tm_mon+1, mtime.tm_mday, mtime.tm_hour, mtime.tm_min, mtime.tm_sec); if (bytes >= 1099511627776.0) printf(" %9.3f TiB", bytes / 1099511627776.0); else if (bytes >= 1073741824.0) printf(" %9.3f GiB", bytes / 1073741824.0); else if (bytes >= 1048576.0) printf(" %9.3f MiB", bytes / 1048576.0); else if (bytes >= 1024.0) printf(" %9.3f KiB", bytes / 1024.0); else printf(" %9.0f B ", bytes); if (typeflag == FTW_SL) { char *target; size_t maxlen = 1023; ssize_t len; while (1) { target = malloc(maxlen + 1); if (target == NULL) return ENOMEM; len = readlink(filepath, target, maxlen); if (len == (ssize_t)-1) { const int saved_errno = errno; free(target); return saved_errno; } if (len >= (ssize_t)maxlen) { free(target); maxlen += 1024; continue; } target[len] = '\0'; break; } printf(" %s -> %s
", filepath, target); free(target); } else if (typeflag == FTW_SLN) printf(" %s (dangling symlink)
", filepath); else if (typeflag == FTW_F) printf(" %s
", filepath); else if (typeflag == FTW_D || typeflag == FTW_DP) printf(" %s/
", filepath); else if (typeflag == FTW_DNR) printf(" %s/ (unreadable)
", filepath); else printf(" %s (unknown)
", filepath); return 0; } int print_directory_tree(const char *const dirpath) { int result; /* Invalid directory path? */ if (dirpath == NULL || *dirpath == '\0') return errno = EINVAL; result = nftw(dirpath, print_entry, USE_FDS, FTW_PHYS); if (result >= 0) errno = result; return errno; } int main(int argc, char *argv[]) { int arg; if (argc < 2) { if (print_directory_tree(".")) { fprintf(stderr, "%s.
", strerror(errno)); return EXIT_FAILURE; } } else { for (arg = 1; arg < argc; arg++) { if (print_directory_tree(argv[arg])) { fprintf(stderr, "%s.
", strerror(errno)); return EXIT_FAILURE; } } } return EXIT_SUCCESS; }

上記のコードのほとんどはprint_entry()にあります。そのタスクは、各ディレクトリエントリを印刷することです。 print_directory_tree()では、nftw()に、表示される各ディレクトリエントリに対して呼び出すように指示します。

上記で唯一あいまいな点は、nftw()に使用させるファイル記述子の数をどう決めるかです。プログラムがファイルツリーウォーク中に（標準ストリームに加えて）最大2つの追加ファイル記述子しか使用しない場合、15は安全な値であることが知られています（nftw()を持ち、おおむねPOSIX準拠であるすべてのシステムにおいて）。

Linuxでは、sysconf(_SC_OPEN_MAX)を使用して開けるファイルの最大数を調べ、nftw()呼び出しと同時に使用する数を減算することもできますが、私ならそこまではしません（そのユーティリティが主に病的なほど深いディレクトリ構造で使用されるとわかっている場合を除きます）。15個の記述子は、ツリーの深さを制限しません。nftw()は単に遅くなるだけです（また、あるディレクトリから13階層より深いディレクトリを歩いている場合、そのディレクトリ内の変更を検出できないことがありますが、このトレードオフや変更を検出する一般的な能力はシステムとCライブラリの実装によって異なります）。このようなコンパイル時定数を使用するだけでコードの移植性が保たれます。Linuxだけでなく、Mac OS Xと現在のすべてのBSDバリアント、および古すぎない他のUnixバリアントでも動作するはずです。

Ruslanはコメントで、64ビットのサイズ/オフセットを必要とするファイルシステムエントリがあり、「通常」バージョンのnftw()がerrno == EOVERFLOWで失敗したため、nftw64()に切り替える必要があったと述べました。正しい解決策は、GLIBC固有の64ビット関数に切り替えるのではなく、_LARGEFILE64_SOURCEおよび_FILE_OFFSET_BITS 64を定義することです。これらは、標準の関数（nftw()、fstat()など）と型名（off_tなど）を使用したまま、可能であれば64ビットのファイルサイズとオフセットに切り替えるようCライブラリに指示します。

Jan · Answer

#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Build "parent/name" in a freshly allocated buffer.
 * The caller owns the result and must free() it; NULL on allocation failure. */
static char *join_path(const char *parent, const char *name)
{
    size_t size = strlen(parent) + strlen(name) + 2; /* '/' and '\0' */
    char *path = malloc(size);

    if (path != NULL)
        snprintf(path, size, "%s/%s", parent, name);
    return path;
}

/*
 * Decide whether entry `name` inside directory `parent` should be descended
 * into: nonzero when it is a directory other than "." and "..", 0 otherwise.
 *
 * An entry that cannot be examined (stat() fails, or out of memory) is
 * reported as "not a directory" rather than inspected through an
 * uninitialised struct stat. stat() follows symbolic links, so a link to a
 * directory counts as a directory.
 */
int is_directory_we_want_to_list(const char *parent, char *name)
{
    struct stat st_buf;
    char *path;
    int is_dir;

    if (!strcmp(".", name) || !strcmp("..", name))
        return 0;

    path = join_path(parent, name);
    if (path == NULL)
        return 0;

    is_dir = stat(path, &st_buf) == 0 && S_ISDIR(st_buf.st_mode);
    free(path);
    return is_dir;
}

/*
 * Print the name of every entry below directory `name`, one per line,
 * recursing into subdirectories.
 *
 * Returns 0 when `name` could be opened and -1 when it could not.
 * Failures in subdirectories do not abort the listing of their siblings.
 */
int list(const char *name)
{
    DIR *dir = opendir(name);
    struct dirent *ent;

    if (dir == NULL)
        return -1;

    while ((ent = readdir(dir)) != NULL) {
        char *entry_name = ent->d_name;

        printf("%s\n", entry_name);

        if (is_directory_we_want_to_list(name, entry_name)) {
            char *next = join_path(name, entry_name);

            if (next != NULL) {
                list(next);
                free(next);
            }
        }
    }

    closedir(dir);
    return 0;
}

この文脈で目を通しておく価値のあるヘッダーファイル： stat.h 、 dirent.h 。上記のコードは、発生しうるエラーをすべてチェックしているわけではないことに注意してください。

完全に異なるアプローチが、ftw.hで定義されている ftw によって提供されています。

Myst · Answer

コメントで述べたように、このタスクに対する再帰的なアプローチには2つの固有の欠陥があると考えています。

最初の欠陥は、開いているファイルの制限です。この制限は、深いトラバーサルに制限を課します。十分なサブフォルダーがある場合、再帰的なアプローチは壊れます。（スタックオーバーフローに関する編集を参照）

2番目の欠陥はもう少し微妙です。再帰的なアプローチでは、ハードリンクのテストが非常に難しくなります。フォルダーツリーが循環している場合（ハードリンクが原因）、再帰的なアプローチは中断されます（スタックオーバーフローが発生しないことが望ましい）。（ハードリンクに関する編集を参照）

ただし、再帰を単一のファイル記述子とリンクリストに置き換えることにより、これらの問題を回避するのは非常に簡単です。

これは学校の課題ではなく、再帰を使うことは必須ではないと想定しています。

これがアプリケーションの例です。

フォルダーツリーを表示するには a.out ./ を使用します。

マクロなどを多用していることをお詫びします。通常はインライン関数を使用しますが、すべてが単一の関数に収まっているほうがコードを追いやすいと考えました。

#include <dirent.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> int main(int argc, char const *argv[]) { /* print use instruction unless a folder name was given */ if (argc < 2) fprintf(stderr, "
use:
" " %s <directory>
" "for example:
" " %s ./

", argv[0], argv[0]), exit(0); /*************** a small linked list macro implementation ***************/ typedef struct list_s { struct list_s *next; struct list_s *prev; } list_s; #define LIST_INIT(name) \ { .next = &name, .prev = &name } #define LIST_Push(dest, node) \ do { \ (node)->next = (dest)->next; \ (node)->prev = (dest); \ (node)->next->prev = (node); \ (dest)->next = (node); \ } while (0); #define LIST_POP(list, var) \ if ((list)->next == (list)) { \ var = NULL; \ } else { \ var = (list)->next; \ (list)->next = var->next; \ var->next->prev = var->prev; \ } /*************** a record (file / folder) item type ***************/ typedef struct record_s { /* this is a flat processing queue. */ list_s queue; /* this will list all queued and processed folders (cyclic protection) */ list_s folders; /* this will list all the completed items (siblings and such) */ list_s list; /* unique ID */ ino_t ino; /* name length */ size_t len; /* name string */ char name[]; } record_s; /* take a list_s pointer and convert it to the record_s pointer */ #define NODE2RECORD(node, list_name) \ ((record_s *)(((uintptr_t)(node)) - \ ((uintptr_t) & ((record_s *)0)->list_name))) /* initializes a new record */ #define RECORD_INIT(name) \ (record_s){.queue = LIST_INIT((name).queue), \ .folders = LIST_INIT((name).folders), \ .list = LIST_INIT((name).list)} /*************** the actual code ***************/ record_s records = RECORD_INIT(records); record_s *pos, *item; list_s *tmp; DIR *dir; struct dirent *entry; /* initialize the root folder record and add it to the queue */ pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2); *pos = RECORD_INIT(*pos); pos->len = strlen(argv[1]); memcpy(pos->name, argv[1], pos->len); if (pos->name[pos->len - 1] != '/') pos->name[pos->len++] = '/'; pos->name[pos->len] = 0; /* Push to queue, but also Push to list (first item processed) */ LIST_Push(&records.queue, &pos->queue); LIST_Push(&records.list, &pos->list); /* as long as the queue has items to be 
processed, do so */ while (records.queue.next != &records.queue) { /* pop queued item */ LIST_POP(&records.queue, tmp); /* collect record to process */ pos = NODE2RECORD(tmp, queue); /* add record to the processed folder list */ LIST_Push(&records.folders, &pos->folders); /* process the folder and add all folder data to current list */ dir = opendir(pos->name); if (!dir) continue; while ((entry = readdir(dir)) != NULL) { /* create new item, copying it's path data and unique ID */ item = malloc(sizeof(*item) + pos->len + entry->d_namlen + 2); *item = RECORD_INIT(*item); item->len = pos->len + entry->d_namlen; memcpy(item->name, pos->name, pos->len); memcpy(item->name + pos->len, entry->d_name, entry->d_namlen); item->name[item->len] = 0; item->ino = entry->d_ino; /* add item to the list, right after the `pos` item */ LIST_Push(&pos->list, &item->list); /* unless it's a folder, we're done. */ if (entry->d_type != DT_DIR) continue; /* test for '.' and '..' */ if (entry->d_name[0] == '.' && (entry->d_name[1] == 0 || (entry->d_name[1] == '.' && entry->d_name[2] == 0))) continue; /* add folder marker */ item->name[item->len++] = '/'; item->name[item->len] = 0; /* test for cyclic processing */ list_s *t = records.folders.next; while (t != &records.folders) { if (NODE2RECORD(t, folders)->ino == item->ino) { /* we already processed this folder! */ break; /* this breaks from the small loop... */ } t = t->next; } if (t != &records.folders) continue; /* if we broke from the small loop, entry is done */ /* item is a new folder, add to queue */ LIST_Push(&records.queue, &item->queue); } closedir(dir); } /*************** Printing the results and cleaning up ***************/ while (records.list.next != &records.list) { /* pop list item */ LIST_POP(&records.list, tmp); /* collect record to process */ pos = NODE2RECORD(tmp, list); /* prepare for next iteration */ LIST_POP(&records.list, tmp); fwrite(pos->name, pos->len, 1, stderr); fwrite("
", 1, 1, stderr); free(pos); } return 0; }

編集

@Stargateurはコメントで、開いているファイルの制限に達する前に、再帰コードがスタックをオーバーフローさせる可能性が高いと述べました。

スタックオーバーフローのほうがましだとは思えませんが、プロセスが呼び出された時点でファイル制限に近づいていない限り、この評価はおそらく正しいでしょう。

コメントで@Stargateurが言及した別のポイントは、再帰コードの深さはサブディレクトリの最大数（ext4ファイルシステムでは64000）によって制限されること、そしてハードリンクによる循環は極めて起こりにくいことです（フォルダーへのハードリンクはLinux/Unixでは許可されていないため）。

これは、コードがLinux上で実行されている場合（質問によるとそうです）には朗報で、この問題は実際の懸念事項ではありません（macOSや、場合によってはWindowsでコードを実行しない限り）...ただし、それほど深い再帰ではスタックが溢れてしまう可能性があります。

そうは言っても、非再帰オプションには、処理されるアイテムの量に簡単に制限を加えたり、結果をキャッシュできるなどの利点があります。

P.S。

コメントによると、これは循環階層をチェックしないコードの非再帰バージョンです。これは高速で、フォルダーへのハードリンクが許可されていないLinuxマシンで使用するのに十分安全である必要があります。

#include <dirent.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> int main(int argc, char const *argv[]) { /* print use instruction unless a folder name was given */ if (argc < 2) fprintf(stderr, "
use:
" " %s <directory>
" "for example:
" " %s ./

", argv[0], argv[0]), exit(0); /*************** a small linked list macro implementation ***************/ typedef struct list_s { struct list_s *next; struct list_s *prev; } list_s; #define LIST_INIT(name) \ { .next = &name, .prev = &name } #define LIST_Push(dest, node) \ do { \ (node)->next = (dest)->next; \ (node)->prev = (dest); \ (node)->next->prev = (node); \ (dest)->next = (node); \ } while (0); #define LIST_POP(list, var) \ if ((list)->next == (list)) { \ var = NULL; \ } else { \ var = (list)->next; \ (list)->next = var->next; \ var->next->prev = var->prev; \ } /*************** a record (file / folder) item type ***************/ typedef struct record_s { /* this is a flat processing queue. */ list_s queue; /* this will list all the completed items (siblings and such) */ list_s list; /* unique ID */ ino_t ino; /* name length */ size_t len; /* name string */ char name[]; } record_s; /* take a list_s pointer and convert it to the record_s pointer */ #define NODE2RECORD(node, list_name) \ ((record_s *)(((uintptr_t)(node)) - \ ((uintptr_t) & ((record_s *)0)->list_name))) /* initializes a new record */ #define RECORD_INIT(name) \ (record_s){.queue = LIST_INIT((name).queue), .list = LIST_INIT((name).list)} /*************** the actual code ***************/ record_s records = RECORD_INIT(records); record_s *pos, *item; list_s *tmp; DIR *dir; struct dirent *entry; /* initialize the root folder record and add it to the queue */ pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2); *pos = RECORD_INIT(*pos); pos->len = strlen(argv[1]); memcpy(pos->name, argv[1], pos->len); if (pos->name[pos->len - 1] != '/') pos->name[pos->len++] = '/'; pos->name[pos->len] = 0; /* Push to queue, but also Push to list (first item processed) */ LIST_Push(&records.queue, &pos->queue); LIST_Push(&records.list, &pos->list); /* as long as the queue has items to be processed, do so */ while (records.queue.next != &records.queue) { /* pop queued item */ LIST_POP(&records.queue, tmp); /* collect 
record to process */ pos = NODE2RECORD(tmp, queue); /* process the folder and add all folder data to current list */ dir = opendir(pos->name); if (!dir) continue; while ((entry = readdir(dir)) != NULL) { /* create new item, copying it's path data and unique ID */ item = malloc(sizeof(*item) + pos->len + entry->d_namlen + 2); *item = RECORD_INIT(*item); item->len = pos->len + entry->d_namlen; memcpy(item->name, pos->name, pos->len); memcpy(item->name + pos->len, entry->d_name, entry->d_namlen); item->name[item->len] = 0; item->ino = entry->d_ino; /* add item to the list, right after the `pos` item */ LIST_Push(&pos->list, &item->list); /* unless it's a folder, we're done. */ if (entry->d_type != DT_DIR) continue; /* test for '.' and '..' */ if (entry->d_name[0] == '.' && (entry->d_name[1] == 0 || (entry->d_name[1] == '.' && entry->d_name[2] == 0))) continue; /* add folder marker */ item->name[item->len++] = '/'; item->name[item->len] = 0; /* item is a new folder, add to queue */ LIST_Push(&records.queue, &item->queue); } closedir(dir); } /*************** Printing the results and cleaning up ***************/ while (records.list.next != &records.list) { /* pop list item */ LIST_POP(&records.list, tmp); /* collect record to process */ pos = NODE2RECORD(tmp, list); /* prepare for next iteration */ LIST_POP(&records.list, tmp); fwrite(pos->name, pos->len, 1, stderr); fwrite("
", 1, 1, stderr); free(pos); } return 0; }

chqrlie · Answer

再帰的ですが、使用するスタックスペースがはるかに少ない簡易バージョンを次に示します。

#include <errno.h> #include <stdio.h> #include <string.h> #include <sys/types.h> #include <unistd.h> #include <dirent.h> void listdir(char *path, size_t size) { DIR *dir; struct dirent *entry; size_t len = strlen(path); if (!(dir = opendir(path))) { fprintf(stderr, "path not found: %s: %s
", path, strerror(errno)); return; } puts(path); while ((entry = readdir(dir)) != NULL) { char *name = entry->d_name; if (entry->d_type == DT_DIR) { if (!strcmp(name, ".") || !strcmp(name, "..")) continue; if (len + strlen(name) + 2 > size) { fprintf(stderr, "path too long: %s/%s
", path, name); } else { path[len] = '/'; strcpy(path + len + 1, name); listdir(path, size); path[len] = '\0'; } } else { printf("%s/%s
", path, name); } } closedir(dir); } int main(void) { char path[1024] = "."; listdir(path, sizeof path); return 0; }

私のシステムでは、その出力はfind .の出力とまったく同じです。