Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 41 additions & 18 deletions src/discover/discover.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@

#include "foundation/constants.h"
#include "foundation/compat_fs.h"
#ifdef _WIN32
#include "foundation/win_utf8.h"
#endif
#include <stdint.h> // int64_t
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // strdup
#include <sys/stat.h>

/* ── Hardcoded always-skip directories ──────────────────────────── */
/* ── Hardcoded always-skip directories ──────────────────────────── */

static const char *ALWAYS_SKIP_DIRS[] = {
/* VCS */
Expand Down Expand Up @@ -52,7 +55,7 @@ static const char *FAST_SKIP_DIRS[] = {
"locale", "locales", "i18n", "l10n", "scripts", "tools",
"hack", "bin", "build", "out", NULL};

/* ── Ignored suffixes ────────────────────────────────────────────── */
/* ── Ignored suffixes ───────────────────────────────── */

static const char *ALWAYS_IGNORED_SUFFIXES[] = {
".tmp", "~", ".pyc", ".pyo", ".o", ".a", ".so", ".dll",
Expand All @@ -68,7 +71,7 @@ static const char *FAST_IGNORED_SUFFIXES[] = {
".crt", ".key", ".cer", ".p12", ".pb", ".avro", ".parquet", ".beam",
".elc", ".rlib", ".coverage", ".prof", ".out", ".patch", ".diff", NULL};

/* ── Fast-mode skip filenames ────────────────────────────────────── */
/* ── Fast-mode skip filenames ─────────────────────── */

static const char *FAST_SKIP_FILENAMES[] = {
"LICENSE", "LICENSE.txt", "LICENSE.md", "LICENSE-MIT", "LICENSE-APACHE",
Expand All @@ -79,14 +82,14 @@ static const char *FAST_SKIP_FILENAMES[] = {
"mix.lock", "flake.lock", "pubspec.lock", "composer.lock", "package-lock.json",
"configure", "Makefile.in", "config.guess", "config.sub", NULL};

/* ── Fast-mode substring patterns ────────────────────────────────── */
/* ── Fast-mode substring patterns ───────────────────── */

/* Table of substring patterns used in fast mode. The array is terminated by a
 * NULL sentinel rather than carrying a separate length, so new entries must be
 * inserted before the trailing NULL. */
static const char *FAST_PATTERNS[] = {".d.ts", ".bundle.", ".chunk.", ".generated.",
".pb.go", "_pb2.py", ".pb2.py", "_grpc.pb.go",
"_string.go", "mock_", "_mock.", "_test_helpers.",
".stories.", ".spec.", ".test.", NULL};

/* ── Ignored JSON filenames ──────────────────────────────────────── */
/* ── Ignored JSON filenames ──────────────────────── */

static const char *IGNORED_JSON_FILES[] = {
"package.json", "package-lock.json", "tsconfig.json",
Expand All @@ -111,7 +114,7 @@ static bool str_in_list(const char *s, const char *const *list) {
return false;
}

/* ── Helper: check if string ends with suffix ────────────────────── */
/* ── Helper: check if string ends with suffix ────────────── */

static bool ends_with(const char *s, const char *suffix) {
size_t slen = strlen(s);
Expand All @@ -122,13 +125,13 @@ static bool ends_with(const char *s, const char *suffix) {
return strcmp(s + slen - sufflen, suffix) == 0;
}

/* ── Helper: check if string contains substring ──────────────────── */
/* ── Helper: check if string contains substring ───────────── */

/* Report whether `sub` occurs anywhere inside `s`.
 * Both arguments must be valid NUL-terminated strings; matching follows
 * strstr(), so an empty `sub` always matches. */
static bool str_contains(const char *s, const char *sub) {
    const char *hit = strstr(s, sub);
    if (hit == NULL) {
        return false;
    }
    return true;
}

/* ── Public filter functions ─────────────────────────────────────── */
/* ── Public filter functions ─────────────────────── */

bool cbm_should_skip_dir(const char *dirname, cbm_index_mode_t mode) {
if (!dirname) {
Expand Down Expand Up @@ -199,7 +202,7 @@ bool cbm_matches_fast_pattern(const char *filename, cbm_index_mode_t mode) {
return false;
}

/* ── Dynamic file list ───────────────────────────────────────────── */
/* ── Dynamic file list ────────────────────────── */

typedef struct {
cbm_file_info_t *files;
Expand All @@ -226,7 +229,7 @@ static void fl_add(file_list_t *fl, const char *abs_path, const char *rel_path,
fi->size = size;
}

/* ── Recursive walk ──────────────────────────────────────────────── */
/* ── Recursive walk ─────────────────────────────── */

/* Compute path relative to a nested .gitignore's directory.
* "webapp/src/foo.js" with prefix "webapp" → "src/foo.js". */
Expand Down Expand Up @@ -315,21 +318,41 @@ static CBMLanguage detect_file_language(const char *entry_name, const char *abs_
return lang;
}

/* UTF-8-safe stat: wide API on Windows, regular stat on POSIX.
 *
 * path: path to query. On Windows it is converted with cbm_utf8_to_wide()
 *       and queried through _wstat64().
 * st:   receives the result. On Windows only st_mode, st_size and st_mtime
 *       carry real values; every other field is zeroed.
 *
 * Returns 0 on success. On Windows any failure (conversion or stat) yields
 * CBM_NOT_FOUND; on POSIX the return value of stat() is passed through. */
static int wide_stat(const char *path, struct stat *st) {
#ifdef _WIN32
    wchar_t *wpath = cbm_utf8_to_wide(path);
    if (!wpath) {
        return CBM_NOT_FOUND;
    }
    struct _stat64 wst;
    int ret = _wstat64(wpath, &wst);
    free(wpath);
    if (ret != 0) {
        return CBM_NOT_FOUND;
    }
    /* Zero first so fields we do not copy are never left indeterminate. */
    memset(st, 0, sizeof(*st));
    st->st_mode = wst.st_mode;
    /* NOTE(review): struct stat's st_size may be narrower than the 64-bit
     * wst.st_size on some Windows toolchains — confirm behavior for files
     * larger than 2 GiB. */
    st->st_size = wst.st_size;
    st->st_mtime = wst.st_mtime;
    return 0;
#else
    return stat(path, st);
#endif
}

/* Stat a path, skipping symlinks. Returns 0 on success, CBM_NOT_FOUND to skip.
 *
 * POSIX: uses lstat() so a symlink is reported as itself and then rejected.
 * Windows: delegates to wide_stat(); no symlink check is performed there. */
static int safe_stat(const char *abs_path, struct stat *st) {
#ifdef _WIN32
    return wide_stat(abs_path, st);
#else
    if (lstat(abs_path, st) != 0) {
        return CBM_NOT_FOUND;
    }
    if (S_ISLNK(st->st_mode)) {
        return CBM_NOT_FOUND;
    }
    return 0;
#endif
}

/* Process a single regular file entry during directory walk. */
Expand Down Expand Up @@ -364,7 +387,7 @@ static cbm_gitignore_t *try_load_nested_gitignore(const walk_frame_t *frame) {
char gi_path[CBM_SZ_4K];
snprintf(gi_path, sizeof(gi_path), "%s/.gitignore", frame->dir);
struct stat gi_st;
if (stat(gi_path, &gi_st) == 0 && S_ISREG(gi_st.st_mode)) {
if (wide_stat(gi_path, &gi_st) == 0 && S_ISREG(gi_st.st_mode)) {
return cbm_gitignore_load(gi_path);
}
return NULL;
Expand Down Expand Up @@ -461,7 +484,7 @@ static void walk_dir(const char *dir_path, const char *rel_prefix, const cbm_dis
free(stack);
}

/* ── Public API ──────────────────────────────────────────────────── */
/* ── Public API ───────────────────────────────── */

int cbm_discover(const char *repo_path, const cbm_discover_opts_t *opts, cbm_file_info_t **out,
int *count) {
Expand All @@ -474,7 +497,7 @@ int cbm_discover(const char *repo_path, const cbm_discover_opts_t *opts, cbm_fil

/* Verify directory exists */
struct stat st;
if (stat(repo_path, &st) != 0 || !S_ISDIR(st.st_mode)) {
if (wide_stat(repo_path, &st) != 0 || !S_ISDIR(st.st_mode)) {
return CBM_NOT_FOUND;
}

Expand All @@ -483,7 +506,7 @@ int cbm_discover(const char *repo_path, const cbm_discover_opts_t *opts, cbm_fil
char gi_path[CBM_SZ_4K];
snprintf(gi_path, sizeof(gi_path), "%s/.git", repo_path);
struct stat gi_stat;
if (stat(gi_path, &gi_stat) == 0 && S_ISDIR(gi_stat.st_mode)) {
if (wide_stat(gi_path, &gi_stat) == 0 && S_ISDIR(gi_stat.st_mode)) {
snprintf(gi_path, sizeof(gi_path), "%s/.gitignore", repo_path);
gitignore = cbm_gitignore_load(gi_path);
}
Expand Down
114 changes: 75 additions & 39 deletions src/foundation/compat_fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,20 @@

#ifdef _WIN32

/* ── Windows implementation ───────────────────────────────────── */
/* ── Windows implementation ────────────────────────────────── */

#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#include <direct.h> /* _mkdir */
#include <io.h> /* _unlink */
#include <direct.h> /* _wmkdir */
#include <io.h> /* _wunlink */
#include "foundation/win_utf8.h"

struct cbm_dir {
HANDLE find_handle;
WIN32_FIND_DATAA find_data;
WIN32_FIND_DATAW find_data;
wchar_t wide_pattern[CBM_PATH_MAX];
cbm_dirent_t entry;
bool first;
bool done;
Expand All @@ -34,27 +36,36 @@ cbm_dir_t *cbm_opendir(const char *path) {
if (!path) {
return NULL;
}
/* Build search pattern: "path\*" */
size_t len = strlen(path);
char *pattern = (char *)malloc(len + 3);
if (!pattern) {
wchar_t *wpath = cbm_utf8_to_wide(path);
if (!wpath) {
return NULL;
}
memcpy(pattern, path, len);
if (len > 0 && path[len - SKIP_ONE] != '\\' && path[len - SKIP_ONE] != '/') {
pattern[len++] = '\\';

size_t wlen = wcslen(wpath);
if (wlen == 0 || wlen + 2 >= CBM_PATH_MAX) {
free(wpath);
return NULL;
}
pattern[len++] = '*';
pattern[len] = '\0';

cbm_dir_t *d = (cbm_dir_t *)calloc(CBM_ALLOC_ONE, sizeof(cbm_dir_t));
if (!d) {
free(pattern);
free(wpath);
return NULL;
}

d->find_handle = FindFirstFileA(pattern, &d->find_data);
free(pattern);
wmemcpy(d->wide_pattern, wpath, wlen + 1);
wchar_t *p = d->wide_pattern + wlen - SKIP_ONE;
if (*p != L'\\' && *p != L'/') {
++p;
*p++ = L'\\';
} else {
++p;
}
*p++ = L'*';
*p = L'\0';
free(wpath);

d->find_handle = FindFirstFileW(d->wide_pattern, &d->find_data);
if (d->find_handle == INVALID_HANDLE_VALUE) {
free(d);
return NULL;
Expand All @@ -69,31 +80,36 @@ cbm_dirent_t *cbm_readdir(cbm_dir_t *d) {
return NULL;
}
if (!d->first) {
if (!FindNextFileA(d->find_handle, &d->find_data)) {
if (!FindNextFileW(d->find_handle, &d->find_data)) {
d->done = true;
return NULL;
}
}
d->first = false;

/* Skip "." and ".." */
while (d->find_data.cFileName[0] == '.' &&
(d->find_data.cFileName[1] == '\0' ||
(d->find_data.cFileName[1] == '.' && d->find_data.cFileName[2] == '\0'))) {
if (!FindNextFileA(d->find_handle, &d->find_data)) {
while (d->find_data.cFileName[0] == L'.' &&
(d->find_data.cFileName[1] == L'\0' ||
(d->find_data.cFileName[1] == L'.' && d->find_data.cFileName[2] == L'\0'))) {
if (!FindNextFileW(d->find_handle, &d->find_data)) {
d->done = true;
return NULL;
}
}

size_t nlen = strlen(d->find_data.cFileName);
char *u8 = cbm_wide_to_utf8(d->find_data.cFileName);
if (!u8) {
d->done = true;
return NULL;
}
size_t nlen = strlen(u8);
if (nlen >= CBM_DIRENT_NAME_MAX) {
nlen = CBM_DIRENT_NAME_MAX - SKIP_ONE;
}
memcpy(d->entry.name, d->find_data.cFileName, nlen);
memcpy(d->entry.name, u8, nlen);
d->entry.name[nlen] = '\0';
free(u8);
d->entry.is_dir = (d->find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
d->entry.d_type = 0; /* Not meaningful on Windows */
d->entry.d_type = 0;
return &d->entry;
}

Expand All @@ -115,34 +131,54 @@ int cbm_pclose(FILE *f) {
}

/* Create `path` and any missing parent directories (Windows, UTF-8 aware).
 *
 * path: UTF-8 directory path; both '/' and '\\' are accepted as separators.
 * mode: ignored on this platform.
 *
 * Returns true when the final directory was created or already exists.
 * Returns false for a NULL or empty path, when the UTF-8 to wide conversion
 * fails, or when the final directory cannot be created. */
bool cbm_mkdir_p(const char *path, int mode) {
    (void)mode; /* Windows ignores POSIX permissions */
    if (!path) {
        return false;
    }
    wchar_t *wpath = cbm_utf8_to_wide(path);
    if (!wpath) {
        return false;
    }
    /* An empty path would make the walk below start past the terminator. */
    if (wpath[0] == L'\0') {
        free(wpath);
        return false;
    }

    /* Fast path: the leaf can be created directly. */
    if (_wmkdir(wpath) == 0) {
        free(wpath);
        return true;
    }

    /* Slow path: create each prefix in turn. wpath is our own heap copy, so
     * it is edited in place instead of duplicating it again. The walk begins
     * SKIP_ONE characters in (presumably 1 — confirm) so a leading separator
     * does not produce an empty prefix. Separators are restored as '\\'. */
    for (wchar_t *p = wpath + SKIP_ONE; *p; p++) {
        if (*p == L'/' || *p == L'\\') {
            *p = L'\0';
            _wmkdir(wpath); /* ignore errors for intermediate dirs */
            *p = L'\\';
        }
    }
    bool ok = _wmkdir(wpath) == 0 || GetLastError() == ERROR_ALREADY_EXISTS;
    free(wpath);
    return ok;
}

/* Delete the file named by UTF-8 `path` using the wide-character CRT call,
 * so names outside the active ANSI code page can be removed.
 *
 * Returns the result of _wunlink(), or CBM_NOT_FOUND when `path` is NULL or
 * cannot be converted to a wide string. */
int cbm_unlink(const char *path) {
    if (!path) {
        return CBM_NOT_FOUND;
    }
    wchar_t *wpath = cbm_utf8_to_wide(path);
    if (!wpath) {
        return CBM_NOT_FOUND;
    }
    int ret = _wunlink(wpath);
    free(wpath);
    return ret;
}

/* Remove the directory named by UTF-8 `path` using the wide-character CRT
 * call, so names outside the active ANSI code page can be removed.
 *
 * Returns the result of _wrmdir(), or CBM_NOT_FOUND when `path` is NULL or
 * cannot be converted to a wide string. */
int cbm_rmdir(const char *path) {
    if (!path) {
        return CBM_NOT_FOUND;
    }
    wchar_t *wpath = cbm_utf8_to_wide(path);
    if (!wpath) {
        return CBM_NOT_FOUND;
    }
    int ret = _wrmdir(wpath);
    free(wpath);
    return ret;
}

int cbm_exec_no_shell(const char *const *argv) {
Expand All @@ -154,7 +190,7 @@ int cbm_exec_no_shell(const char *const *argv) {

#else /* POSIX */

/* ── POSIX implementation ─────────────────────────────────────── */
/* ── POSIX implementation ────────────────────────────────── */

#include <dirent.h>
#include <errno.h>
Expand Down
Loading