diff --git a/src/discover/discover.c b/src/discover/discover.c index b1e49ae8..4e059b76 100644 --- a/src/discover/discover.c +++ b/src/discover/discover.c @@ -13,13 +13,16 @@ #include "foundation/constants.h" #include "foundation/compat_fs.h" +#ifdef _WIN32 +#include "foundation/win_utf8.h" +#endif #include // int64_t #include #include #include // strdup #include -/* ── Hardcoded always-skip directories ───────────────────────────── */ +/* ── Hardcoded always-skip directories ──────────────────────────── */ static const char *ALWAYS_SKIP_DIRS[] = { /* VCS */ @@ -52,7 +55,7 @@ static const char *FAST_SKIP_DIRS[] = { "locale", "locales", "i18n", "l10n", "scripts", "tools", "hack", "bin", "build", "out", NULL}; -/* ── Ignored suffixes ────────────────────────────────────────────── */ +/* ── Ignored suffixes ───────────────────────────────── */ static const char *ALWAYS_IGNORED_SUFFIXES[] = { ".tmp", "~", ".pyc", ".pyo", ".o", ".a", ".so", ".dll", @@ -68,7 +71,7 @@ static const char *FAST_IGNORED_SUFFIXES[] = { ".crt", ".key", ".cer", ".p12", ".pb", ".avro", ".parquet", ".beam", ".elc", ".rlib", ".coverage", ".prof", ".out", ".patch", ".diff", NULL}; -/* ── Fast-mode skip filenames ────────────────────────────────────── */ +/* ── Fast-mode skip filenames ─────────────────────── */ static const char *FAST_SKIP_FILENAMES[] = { "LICENSE", "LICENSE.txt", "LICENSE.md", "LICENSE-MIT", "LICENSE-APACHE", @@ -79,14 +82,14 @@ static const char *FAST_SKIP_FILENAMES[] = { "mix.lock", "flake.lock", "pubspec.lock", "composer.lock", "package-lock.json", "configure", "Makefile.in", "config.guess", "config.sub", NULL}; -/* ── Fast-mode substring patterns ────────────────────────────────── */ +/* ── Fast-mode substring patterns ───────────────────── */ static const char *FAST_PATTERNS[] = {".d.ts", ".bundle.", ".chunk.", ".generated.", ".pb.go", "_pb2.py", ".pb2.py", "_grpc.pb.go", "_string.go", "mock_", "_mock.", "_test_helpers.", ".stories.", ".spec.", ".test.", NULL}; -/* ── Ignored 
JSON filenames ──────────────────────────────────────── */ +/* ── Ignored JSON filenames ──────────────────────── */ static const char *IGNORED_JSON_FILES[] = { "package.json", "package-lock.json", "tsconfig.json", @@ -111,7 +114,7 @@ static bool str_in_list(const char *s, const char *const *list) { return false; } -/* ── Helper: check if string ends with suffix ────────────────────── */ +/* ── Helper: check if string ends with suffix ────────────── */ static bool ends_with(const char *s, const char *suffix) { size_t slen = strlen(s); @@ -122,13 +125,13 @@ static bool ends_with(const char *s, const char *suffix) { return strcmp(s + slen - sufflen, suffix) == 0; } -/* ── Helper: check if string contains substring ──────────────────── */ +/* ── Helper: check if string contains substring ───────────── */ static bool str_contains(const char *s, const char *sub) { return strstr(s, sub) != NULL; } -/* ── Public filter functions ─────────────────────────────────────── */ +/* ── Public filter functions ─────────────────────── */ bool cbm_should_skip_dir(const char *dirname, cbm_index_mode_t mode) { if (!dirname) { @@ -199,7 +202,7 @@ bool cbm_matches_fast_pattern(const char *filename, cbm_index_mode_t mode) { return false; } -/* ── Dynamic file list ───────────────────────────────────────────── */ +/* ── Dynamic file list ────────────────────────── */ typedef struct { cbm_file_info_t *files; @@ -226,7 +229,7 @@ static void fl_add(file_list_t *fl, const char *abs_path, const char *rel_path, fi->size = size; } -/* ── Recursive walk ──────────────────────────────────────────────── */ +/* ── Recursive walk ─────────────────────────────── */ /* Compute path relative to a nested .gitignore's directory. * "webapp/src/foo.js" with prefix "webapp" → "src/foo.js". */ @@ -315,12 +318,32 @@ static CBMLanguage detect_file_language(const char *entry_name, const char *abs_ return lang; } -/* Stat a path, skipping symlinks. Returns 0 on success, -1 to skip. 
*/
-static int safe_stat(const char *abs_path, struct stat *st) {
+/* UTF-8-safe stat: wide API on Windows, regular stat on POSIX. Returns 0 on
+ * success. NOTE(review): st_size narrows if struct stat's is 32-bit; confirm. */
+static int wide_stat(const char *path, struct stat *st) {
 #ifdef _WIN32
-    if (stat(abs_path, st) != 0) {
+    struct _stat64 wst;
+    wchar_t *wpath = cbm_utf8_to_wide(path);
+    int ret = wpath ? _wstat64(wpath, &wst) : -1; /* NULL: conversion failed */
+    free(wpath);
+    if (ret != 0) {
         return CBM_NOT_FOUND;
     }
+    /* Zero first: only the three fields below are copied out of wst. */
+    memset(st, 0, sizeof(*st));
+    st->st_mode = wst.st_mode;
+    st->st_size = wst.st_size;
+    st->st_mtime = wst.st_mtime;
+    return 0;
+#else
+    return stat(path, st);
+#endif
+}
+
+/* Stat a path, skipping symlinks. Returns 0 on success, -1 to skip. */
+static int safe_stat(const char *abs_path, struct stat *st) {
+#ifdef _WIN32
+    return wide_stat(abs_path, st);
 #else
     if (lstat(abs_path, st) != 0) {
         return CBM_NOT_FOUND;
@@ -328,8 +351,8 @@ static int safe_stat(const char *abs_path, struct stat *st) {
     if (S_ISLNK(st->st_mode)) {
         return CBM_NOT_FOUND;
     }
-#endif
     return 0;
+#endif
 }
 
 /* Process a single regular file entry during directory walk. 
*/ @@ -364,7 +387,7 @@ static cbm_gitignore_t *try_load_nested_gitignore(const walk_frame_t *frame) { char gi_path[CBM_SZ_4K]; snprintf(gi_path, sizeof(gi_path), "%s/.gitignore", frame->dir); struct stat gi_st; - if (stat(gi_path, &gi_st) == 0 && S_ISREG(gi_st.st_mode)) { + if (wide_stat(gi_path, &gi_st) == 0 && S_ISREG(gi_st.st_mode)) { return cbm_gitignore_load(gi_path); } return NULL; @@ -461,7 +484,7 @@ static void walk_dir(const char *dir_path, const char *rel_prefix, const cbm_dis free(stack); } -/* ── Public API ──────────────────────────────────────────────────── */ +/* ── Public API ───────────────────────────────── */ int cbm_discover(const char *repo_path, const cbm_discover_opts_t *opts, cbm_file_info_t **out, int *count) { @@ -474,7 +497,7 @@ int cbm_discover(const char *repo_path, const cbm_discover_opts_t *opts, cbm_fil /* Verify directory exists */ struct stat st; - if (stat(repo_path, &st) != 0 || !S_ISDIR(st.st_mode)) { + if (wide_stat(repo_path, &st) != 0 || !S_ISDIR(st.st_mode)) { return CBM_NOT_FOUND; } @@ -483,7 +506,7 @@ int cbm_discover(const char *repo_path, const cbm_discover_opts_t *opts, cbm_fil char gi_path[CBM_SZ_4K]; snprintf(gi_path, sizeof(gi_path), "%s/.git", repo_path); struct stat gi_stat; - if (stat(gi_path, &gi_stat) == 0 && S_ISDIR(gi_stat.st_mode)) { + if (wide_stat(gi_path, &gi_stat) == 0 && S_ISDIR(gi_stat.st_mode)) { snprintf(gi_path, sizeof(gi_path), "%s/.gitignore", repo_path); gitignore = cbm_gitignore_load(gi_path); } diff --git a/src/foundation/compat_fs.c b/src/foundation/compat_fs.c index c97897b3..f77ad9a8 100644 --- a/src/foundation/compat_fs.c +++ b/src/foundation/compat_fs.c @@ -13,18 +13,20 @@ #ifdef _WIN32 -/* ── Windows implementation ───────────────────────────────────── */ +/* ── Windows implementation ────────────────────────────────── */ #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif #include -#include /* _mkdir */ -#include /* _unlink */ +#include /* _wmkdir */ +#include /* _wunlink */ 
+#include "foundation/win_utf8.h"
 
 struct cbm_dir {
     HANDLE find_handle;
-    WIN32_FIND_DATAA find_data;
+    WIN32_FIND_DATAW find_data;
+    wchar_t wide_pattern[CBM_PATH_MAX];
     cbm_dirent_t entry;
     bool first;
     bool done;
@@ -34,27 +36,36 @@ cbm_dir_t *cbm_opendir(const char *path) {
     if (!path) {
         return NULL;
     }
-    /* Build search pattern: "path\*" */
-    size_t len = strlen(path);
-    char *pattern = (char *)malloc(len + 3);
-    if (!pattern) {
+    wchar_t *wpath = cbm_utf8_to_wide(path);
+    if (!wpath) {
         return NULL;
     }
-    memcpy(pattern, path, len);
-    if (len > 0 && path[len - SKIP_ONE] != '\\' && path[len - SKIP_ONE] != '/') {
-        pattern[len++] = '\\';
+
+    size_t wlen = wcslen(wpath);
+    if (wlen == 0 || wlen + 2 >= CBM_PATH_MAX) {
+        free(wpath);
+        return NULL;
     }
-    pattern[len++] = '*';
-    pattern[len] = '\0';
 
     cbm_dir_t *d = (cbm_dir_t *)calloc(CBM_ALLOC_ONE, sizeof(cbm_dir_t));
     if (!d) {
-        free(pattern);
+        free(wpath);
         return NULL;
     }
 
-    d->find_handle = FindFirstFileA(pattern, &d->find_data);
-    free(pattern);
+    wmemcpy(d->wide_pattern, wpath, wlen + 1);
+    wchar_t *p = d->wide_pattern + wlen - SKIP_ONE;
+    if (*p != L'\\' && *p != L'/') {
+        ++p;
+        *p++ = L'\\';
+    } else {
+        ++p;
+    }
+    *p++ = L'*';
+    *p = L'\0';
+    free(wpath);
+
+    d->find_handle = FindFirstFileW(d->wide_pattern, &d->find_data);
     if (d->find_handle == INVALID_HANDLE_VALUE) {
         free(d);
         return NULL;
@@ -69,31 +80,41 @@ cbm_dirent_t *cbm_readdir(cbm_dir_t *d) {
         return NULL;
     }
     if (!d->first) {
-        if (!FindNextFileA(d->find_handle, &d->find_data)) {
+        if (!FindNextFileW(d->find_handle, &d->find_data)) {
             d->done = true;
             return NULL;
         }
     }
     d->first = false;
 
     /* Skip "." and ".." */
-    while (d->find_data.cFileName[0] == '.' &&
-           (d->find_data.cFileName[1] == '\0' ||
-            (d->find_data.cFileName[1] == '.' && d->find_data.cFileName[2] == '\0'))) {
-        if (!FindNextFileA(d->find_handle, &d->find_data)) {
+    while (d->find_data.cFileName[0] == L'.' &&
+           (d->find_data.cFileName[1] == L'\0' ||
+            (d->find_data.cFileName[1] == L'.' && d->find_data.cFileName[2] == L'\0'))) {
+        if (!FindNextFileW(d->find_handle, &d->find_data)) {
             d->done = true;
             return NULL;
         }
     }
 
-    size_t nlen = strlen(d->find_data.cFileName);
+    char *u8 = cbm_wide_to_utf8(d->find_data.cFileName);
+    if (!u8) {
+        d->done = true;
+        return NULL;
+    }
+    size_t nlen = strlen(u8);
     if (nlen >= CBM_DIRENT_NAME_MAX) {
         nlen = CBM_DIRENT_NAME_MAX - SKIP_ONE;
+        /* Back up to a UTF-8 character boundary so no sequence is cut in half. */
+        while (nlen > 0 && ((unsigned char)u8[nlen] & 0xC0) == 0x80) {
+            nlen--;
+        }
     }
-    memcpy(d->entry.name, d->find_data.cFileName, nlen);
+    memcpy(d->entry.name, u8, nlen);
     d->entry.name[nlen] = '\0';
+    free(u8);
     d->entry.is_dir = (d->find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
     d->entry.d_type = 0; /* Not meaningful on Windows */
 
     return &d->entry;
 }
@@ -115,34 +136,49 @@ int cbm_pclose(FILE *f) {
 }
 
 bool cbm_mkdir_p(const char *path, int mode) {
     (void)mode; /* Windows ignores POSIX permissions */
+    wchar_t *wpath = cbm_utf8_to_wide(path);
+    if (!wpath) {
+        return false;
+    }
     /* Simple recursive mkdir: try creating, if fail walk parents */
-    if (_mkdir(path) == 0) {
+    if (_wmkdir(wpath) == 0) {
+        free(wpath);
         return true;
     }
-    /* Walk path and create each component */
-    char *tmp = _strdup(path);
-    if (!tmp) {
-        return false;
-    }
-    for (char *p = tmp + SKIP_ONE; *p; p++) {
-        if (*p == '/' || *p == '\\') {
-            *p = '\0';
-            _mkdir(tmp); /* ignore errors for intermediate dirs */
-            *p = '\\';
+    /* Walk path and create each component, in place: wpath is our own copy.
+     * The wpath[0] test keeps an empty path from being scanned past its NUL. */
+    for (wchar_t *p = wpath[0] ? wpath + SKIP_ONE : wpath; *p; p++) {
+        if (*p == L'/' || *p == L'\\') {
+            *p = L'\0';
+            _wmkdir(wpath); /* ignore errors for intermediate dirs */
+            *p = L'\\';
         }
     }
-    bool ok = _mkdir(tmp) == 0 || GetLastError() == ERROR_ALREADY_EXISTS;
-    free(tmp);
+    /* NOTE(review): _wmkdir reports via errno; confirm GetLastError() is valid here. */
+    bool ok = _wmkdir(wpath) == 0 || GetLastError() == ERROR_ALREADY_EXISTS;
+    free(wpath);
     return ok;
 }
 
 int cbm_unlink(const char *path) {
-    return _unlink(path);
+    wchar_t *wpath = cbm_utf8_to_wide(path);
+    if (!wpath) {
+        return CBM_NOT_FOUND;
+    }
+    int ret = _wunlink(wpath);
+ 
free(wpath); + return ret; } int cbm_rmdir(const char *path) { - return _rmdir(path); + wchar_t *wpath = cbm_utf8_to_wide(path); + if (!wpath) { + return CBM_NOT_FOUND; + } + int ret = _wrmdir(wpath); + free(wpath); + return ret; } int cbm_exec_no_shell(const char *const *argv) { @@ -154,7 +190,7 @@ int cbm_exec_no_shell(const char *const *argv) { #else /* POSIX */ -/* ── POSIX implementation ─────────────────────────────────────── */ +/* ── POSIX implementation ────────────────────────────────── */ #include #include diff --git a/src/foundation/platform.c b/src/foundation/platform.c index d9b7cbdf..6b51fcfe 100644 --- a/src/foundation/platform.c +++ b/src/foundation/platform.c @@ -13,7 +13,7 @@ #ifdef _WIN32 -/* ── Windows implementation ───────────────────────────────────── */ +/* ── Windows implementation ────────────────────────────────── */ #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN @@ -21,6 +21,7 @@ #include #include #include +#include "foundation/win_utf8.h" void *cbm_mmap_read(const char *path, size_t *out_size) { if (!path || !out_size) { @@ -28,24 +29,33 @@ void *cbm_mmap_read(const char *path, size_t *out_size) { } *out_size = 0; - HANDLE file = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, + wchar_t *wpath = cbm_utf8_to_wide(path); + if (!wpath) { + return NULL; + } + + HANDLE file = CreateFileW(wpath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); if (file == INVALID_HANDLE_VALUE) { + free(wpath); return NULL; } LARGE_INTEGER sz; if (!GetFileSizeEx(file, &sz) || sz.QuadPart == 0) { CloseHandle(file); + free(wpath); return NULL; } - HANDLE mapping = CreateFileMappingA(file, NULL, PAGE_READONLY, 0, 0, NULL); + HANDLE mapping = CreateFileMappingW(file, NULL, PAGE_READONLY, 0, 0, NULL); if (!mapping) { CloseHandle(file); + free(wpath); return NULL; } void *addr = MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0); CloseHandle(mapping); CloseHandle(file); + free(wpath); if (!addr) { return 
NULL; } @@ -80,18 +90,34 @@ int cbm_nprocs(void) { } bool cbm_file_exists(const char *path) { - DWORD attr = GetFileAttributesA(path); + wchar_t *wpath = cbm_utf8_to_wide(path); + if (!wpath) { + return false; + } + DWORD attr = GetFileAttributesW(wpath); + free(wpath); return attr != INVALID_FILE_ATTRIBUTES; } bool cbm_is_dir(const char *path) { - DWORD attr = GetFileAttributesA(path); + wchar_t *wpath = cbm_utf8_to_wide(path); + if (!wpath) { + return false; + } + DWORD attr = GetFileAttributesW(wpath); + free(wpath); return attr != INVALID_FILE_ATTRIBUTES && (attr & FILE_ATTRIBUTE_DIRECTORY); } int64_t cbm_file_size(const char *path) { + wchar_t *wpath = cbm_utf8_to_wide(path); + if (!wpath) { + return CBM_NOT_FOUND; + } WIN32_FILE_ATTRIBUTE_DATA fad; - if (!GetFileAttributesExA(path, GetFileExInfoStandard, &fad)) { + BOOL ok = GetFileAttributesExW(wpath, GetFileExInfoStandard, &fad); + free(wpath); + if (!ok) { return CBM_NOT_FOUND; } LARGE_INTEGER sz; @@ -113,7 +139,7 @@ char *cbm_normalize_path_sep(char *path) { #else /* POSIX (macOS + Linux) */ -/* ── POSIX implementation ─────────────────────────────────────── */ +/* ── POSIX implementation ────────────────────────────────── */ #include #include @@ -127,7 +153,7 @@ char *cbm_normalize_path_sep(char *path) { #include #endif -/* ── Memory mapping ────────────────────────────────────────────── */ +/* ── Memory mapping ──────────────────────────── */ void *cbm_mmap_read(const char *path, size_t *out_size) { if (!path || !out_size) { @@ -162,7 +188,7 @@ void cbm_munmap(void *addr, size_t size) { } } -/* ── Timing ────────────────────────────────────────────────────── */ +/* ── Timing ───────────────────────────── */ #ifdef __APPLE__ static mach_timebase_info_data_t timebase_info; @@ -190,7 +216,7 @@ uint64_t cbm_now_ms(void) { return cbm_now_ns() / CBM_USEC_PER_SEC; } -/* ── System info ───────────────────────────────────────────────── */ +/* ── System info ───────────────────────────── */ int cbm_nprocs(void) { 
#ifdef __APPLE__ @@ -207,7 +233,7 @@ int cbm_nprocs(void) { #endif } -/* ── File system ───────────────────────────────────────────────── */ +/* ── File system ──────────────────────────── */ bool cbm_file_exists(const char *path) { struct stat st; @@ -242,7 +268,7 @@ char *cbm_normalize_path_sep(char *path) { #endif /* _WIN32 */ -/* ── Environment variables ────────────────────────────────────── */ +/* ── Environment variables ──────────────────────────── */ /* Thread-safe getenv: iterates environ directly instead of calling getenv(). * getenv() is flagged by concurrency-mt-unsafe because the returned pointer @@ -278,7 +304,7 @@ const char *cbm_safe_getenv(const char *name, char *buf, size_t buf_sz, const ch return NULL; } -/* ── Home directory (cross-platform) ──────────────────────────── */ +/* ── Home directory (cross-platform) ───────────────────── */ const char *cbm_get_home_dir(void) { static char buf[CBM_SZ_1K]; @@ -300,7 +326,7 @@ const char *cbm_get_home_dir(void) { return NULL; } -/* ── App config directories (cross-platform) ─────────────────── */ +/* ── App config directories (cross-platform) ────────── */ const char *cbm_app_config_dir(void) { static char buf[CBM_SZ_1K]; @@ -355,7 +381,7 @@ const char *cbm_app_local_dir(void) { #endif } -/* ── Cache directory ─────────────────────────────────────────── */ +/* ── Cache directory ────────────────────────── */ const char *cbm_resolve_cache_dir(void) { static char buf[CBM_SZ_1K]; diff --git a/src/foundation/win_utf8.h b/src/foundation/win_utf8.h new file mode 100644 index 00000000..8a3530a5 --- /dev/null +++ b/src/foundation/win_utf8.h @@ -0,0 +1,44 @@ +#ifndef CBM_WIN_UTF8_H +#define CBM_WIN_UTF8_H + +#ifdef _WIN32 + +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#include +#include + +static inline wchar_t *cbm_utf8_to_wide(const char *utf8) { + if (!utf8) { + return NULL; + } + int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0); + if (len <= 0) { + return NULL; + 
} + wchar_t *w = (wchar_t *)malloc((size_t)len * sizeof(wchar_t)); + if (w) { + MultiByteToWideChar(CP_UTF8, 0, utf8, -1, w, len); + } + return w; +} + +static inline char *cbm_wide_to_utf8(const wchar_t *wide) { + if (!wide) { + return NULL; + } + int len = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL); + if (len <= 0) { + return NULL; + } + char *u8 = (char *)malloc((size_t)len); + if (u8) { + WideCharToMultiByte(CP_UTF8, 0, wide, -1, u8, len, NULL, NULL); + } + return u8; +} + +#endif /* _WIN32 */ +#endif /* CBM_WIN_UTF8_H */