From c344ca95f6231ee1501b1e25c49ea7e87712ca1d Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Mon, 23 Mar 2026 11:39:47 +0800 Subject: [PATCH 1/3] feat(modelfile): add missing ModelScope LFS file patterns to auto-generate Add ML-specific file extensions from ModelScope's LFS tracking list that were missing from ModelFilePatterns, preventing correct auto-classification during `modctl modelfile generate`. Also add sharded/variant wildcard patterns for bin, gguf, and llamafile formats. New patterns: *.arrow, *.parquet, *.ftz, *.ark, *.tfevents*, *.db, *.bin.*, *.gguf.*, *.llamafile.* Co-Authored-By: Claude Opus 4.6 Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 15 +++++++++++-- pkg/modelfile/constants_test.go | 38 +++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 68d8663..9f58862 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -63,7 +63,8 @@ var ( "*.safetensors", // Safe and efficient tensor serialization format // PyTorch formats. - "*.bin", // General binary format + "*.bin", // General binary format + "*.bin.*", // Sharded binary files (e.g., model.bin.1) "*.pt", // PyTorch model "*.pth", // PyTorch model (alternative extension) "*.mar", // PyTorch Model Archive @@ -82,7 +83,8 @@ var ( "*.index", // TensorFlow checkpoint index // GGML formats. - "*.gguf", // GGML Universal Format + "*.gguf", // GGML Universal Format + "*.gguf.*", // Partitioned GGUF files "*.ggml", // GGML format (legacy) "*.ggmf", // GGMF format (deprecated) "*.ggjt", // GGJT format (deprecated) @@ -125,11 +127,20 @@ var ( "*.mleap", // MLeap format (Spark ML) "*.surml", // SurrealML format "*.llamafile", // Llamafile format + "*.llamafile.*", // Llamafile variants "*.caffemodel", // Caffe model format "*.prototxt", // Caffe model definition "*.dlc", // Qualcomm Deep Learning Container "*.circle", // Samsung Circle format "*.nb", // Neural Network Binary format + + // Data and dataset formats. + "*.arrow", // Apache Arrow columnar format + "*.parquet", // Apache Parquet columnar format + "*.ftz", // FastText compressed model + "*.ark", // Kaldi ark format (speech/audio models) + "*.tfevents*", // TensorBoard event files + "*.db", // Database files (LMDB, etc.) } // Code file patterns - supported script and notebook files. diff --git a/pkg/modelfile/constants_test.go b/pkg/modelfile/constants_test.go index e513f50..3702c29 100644 --- a/pkg/modelfile/constants_test.go +++ b/pkg/modelfile/constants_test.go @@ -32,6 +32,44 @@ func TestIsFileType(t *testing.T) { } } +func TestIsFileTypeModelPatterns(t *testing.T) { + testCases := []struct { + filename string + expected bool + }{ + // New data/dataset formats. + {"dataset.arrow", true}, + {"train.parquet", true}, + {"model.ftz", true}, + {"feats.ark", true}, + {"events.out.tfevents.1679012345.hostname", true}, // *.tfevents* matches via filepath.Match (wildcards match dots) + {"training.db", true}, + + // Sharded/variant patterns. + {"model.bin.1", true}, + {"model.bin.part2", true}, + {"model.gguf.part1", true}, + {"model.gguf.00001-of-00003", true}, + {"model.llamafile.zip", true}, + {"model.llamafile.gz", true}, + + // Existing patterns still work. + {"model.safetensors", true}, + {"model.bin", true}, + {"model.gguf", true}, + {"model.llamafile", true}, + + // Non-matching files. + {"readme.txt", false}, + {"script.py", false}, + } + + assert := assert.New(t) + for _, tc := range testCases { + assert.Equal(tc.expected, IsFileType(tc.filename, ModelFilePatterns), "filename: %s", tc.filename) + } +} + func TestIsSkippable(t *testing.T) { testCases := []struct { filename string From 58416ca171dd4b05c1723a909db2d0fbf419ce85 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Mon, 23 Mar 2026 11:59:29 +0800 Subject: [PATCH 2/3] refactor(modelfile): move *.tfevents* from ModelFilePatterns to DocFilePatterns Address code review feedback: TensorBoard event files are training logs/metrics, not model weights. They belong in DocFilePatterns alongside *.log, not in ModelFilePatterns. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 2 +- pkg/modelfile/constants_test.go | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 9f58862..bdf16b9 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -139,7 +139,6 @@ var ( "*.parquet", // Apache Parquet columnar format "*.ftz", // FastText compressed model "*.ark", // Kaldi ark format (speech/audio models) - "*.tfevents*", // TensorBoard event files "*.db", // Database files (LMDB, etc.) } @@ -305,6 +304,7 @@ var ( "SETUP*", // Setup instructions "*requirements*", // Dependency specifications "*.log", // Log files + "*.tfevents*", // TensorBoard event files // Office documents "*.doc", // Microsoft Word 97-2003 Document diff --git a/pkg/modelfile/constants_test.go b/pkg/modelfile/constants_test.go index 3702c29..cabaca4 100644 --- a/pkg/modelfile/constants_test.go +++ b/pkg/modelfile/constants_test.go @@ -42,7 +42,6 @@ func TestIsFileTypeModelPatterns(t *testing.T) { {"train.parquet", true}, {"model.ftz", true}, {"feats.ark", true}, - {"events.out.tfevents.1679012345.hostname", true}, // *.tfevents* matches via filepath.Match (wildcards match dots) {"training.db", true}, // Sharded/variant patterns. @@ -62,6 +61,7 @@ func TestIsFileTypeModelPatterns(t *testing.T) { // Non-matching files. {"readme.txt", false}, {"script.py", false}, + {"events.out.tfevents.1679012345.hostname", false}, // tfevents moved to DocFilePatterns } assert := assert.New(t) @@ -70,6 +70,22 @@ func TestIsFileTypeModelPatterns(t *testing.T) { } } +func TestIsFileTypeDocPatternsTfevents(t *testing.T) { + testCases := []struct { + filename string + expected bool + }{ + {"events.out.tfevents.1679012345.hostname", true}, // *.tfevents* matches via filepath.Match (wildcards match dots) + {"train.tfevents", true}, + {"model.safetensors", false}, // model files should not match doc patterns + } + + assert := assert.New(t) + for _, tc := range testCases { + assert.Equal(tc.expected, IsFileType(tc.filename, DocFilePatterns), "filename: %s", tc.filename) + } +} + func TestIsSkippable(t *testing.T) { testCases := []struct { filename string From 5afca985b5549164220044129fa6e5da4212118d Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Mon, 23 Mar 2026 15:47:55 +0800 Subject: [PATCH 3/3] chore: trigger CI re-run Signed-off-by: Zhao Chen