Commit c5edfbc

Merge branch 'main' into featrue/add_vllm_support

2 parents efc5a0d + 83dd000 · commit c5edfbc

11 files changed: +60 −13 lines

The commit threads a new --use-hfcache flag from both CLI entry points through the scheduler, executor, HTTP server, and model-download utilities, so model files are resolved from the local Hugging Face cache (local_files_only=True) instead of being downloaded over the network.

src/backend/main.py
Lines changed: 1 addition & 0 deletions

@@ -140,6 +140,7 @@ async def serve_index():
         ],
         announce_maddrs=args.announce_maddrs,
         http_port=args.port,
+        use_hfcache=args.use_hfcache,
     )

     request_handler.set_scheduler_manage(scheduler_manage)

src/backend/server/scheduler_manage.py
Lines changed: 3 additions & 1 deletion

@@ -32,6 +32,7 @@ def __init__(
         host_maddrs: List[str] = [],
         announce_maddrs: List[str] = [],
         http_port: int = 3001,
+        use_hfcache: bool = False,
     ):
         """Initialize the manager with networking bootstrap parameters."""
         self.initial_peers = initial_peers
@@ -40,6 +41,7 @@ def __init__(
         self.host_maddrs = host_maddrs
         self.announce_maddrs = announce_maddrs
         self.http_port = http_port
+        self.use_hfcache = use_hfcache
         self.model_name = None
         self.init_nodes_num = None
         self.scheduler = None
@@ -134,7 +136,7 @@ def _start_scheduler(self, model_name, init_nodes_num):
         self.model_name = model_name
         self.init_nodes_num = init_nodes_num

-        model_info = get_model_info(model_name)
+        model_info = get_model_info(model_name, self.use_hfcache)
         self.scheduler = Scheduler(model_info, [], min_nodes_bootstrapping=init_nodes_num)

         # Run the scheduler's event/dispatch loops in background so the process
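
Together with the main.py hunk above, the flag is captured once at construction time and consumed only later, when a model is actually chosen. A minimal runnable sketch of that pattern (the class and model id below are hypothetical stand-ins, not the real Parallax types):

def get_model_info(model_name: str, use_hfcache: bool = False) -> dict:
    # Stand-in for static_config.get_model_info (see its diff below).
    return {"model": model_name, "local_files_only": use_hfcache}

class SchedulerManageSketch:
    def __init__(self, use_hfcache: bool = False):
        # Stored now; no model is known yet at construction time.
        self.use_hfcache = use_hfcache

    def start(self, model_name: str) -> dict:
        # Forwarded only when the scheduler actually starts.
        return get_model_info(model_name, self.use_hfcache)

manage = SchedulerManageSketch(use_hfcache=True)
print(manage.start("some-org/some-model"))  # {'model': ..., 'local_files_only': True}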

src/backend/server/server_args.py
Lines changed: 6 additions & 0 deletions

@@ -35,6 +35,12 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--is-local-network", type=bool, default=True, help="Whether to use local network"
     )
+    parser.add_argument(
+        "--use-hfcache",
+        action="store_true",
+        default=False,
+        help="Use local Hugging Face cache only (no network download)",
+    )

     parser.add_argument(
         "--gpu-backend",

src/backend/server/static_config.py
Lines changed: 4 additions & 2 deletions

@@ -66,7 +66,7 @@
 NODE_JOIN_COMMAND_PUBLIC_NETWORK = """parallax join -s {scheduler_addr} """


-def get_model_info(model_name):
+def get_model_info(model_name, use_hfcache: bool = False):
     def _load_config_only(name: str) -> dict:
         local_path = Path(name)
         if local_path.exists():
@@ -77,7 +77,9 @@ def _load_config_only(name: str) -> dict:
         # Hugging Face only – download just config.json
         from huggingface_hub import hf_hub_download  # type: ignore

-        config_file = hf_hub_download(repo_id=name, filename="config.json")
+        config_file = hf_hub_download(
+            repo_id=name, filename="config.json", local_files_only=use_hfcache
+        )
         with open(config_file, "r") as f:
             return json.load(f)
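
With local_files_only=True, hf_hub_download serves config.json from the local cache and raises instead of touching the network when the file is not cached. A sketch of the behavior the new argument switches on (the repo id is a placeholder):

from huggingface_hub import hf_hub_download
from huggingface_hub.utils import LocalEntryNotFoundError

try:
    # Resolved entirely from the local HF cache when local_files_only=True.
    config_file = hf_hub_download(
        repo_id="some-org/some-model",  # placeholder repo id
        filename="config.json",
        local_files_only=True,
    )
    print("served from cache:", config_file)
except LocalEntryNotFoundError:
    print("config.json not in the local cache; run once without --use-hfcache")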

src/parallax/server/executor.py
Lines changed: 8 additions & 1 deletion

@@ -73,6 +73,7 @@ def __init__(
         dtype: str = "float16",
         # Backend selection
         gpu_backend: str = "sglang",
+        use_hfcache: bool = False,
         # Scheduler Configs
         max_batch_size: Optional[int] = 8,
         max_sequence_length: Optional[int] = None,
@@ -108,6 +109,7 @@ def __init__(
     ):
         # Backend
         self.device = get_current_device()
+        self.use_hfcache = use_hfcache
         logger.debug(f"Executor initializing on device: {self.device}")
         self.backend_type = gpu_backend

@@ -150,6 +152,7 @@ def __init__(
                 "tp_rank": tp_rank,
                 "tp_size": tp_size,
                 "nccl_port": nccl_port,
+                "using_hfcache": use_hfcache,
             }

             self.model_runner, self.config, self.tokenizer = initialize_cuda_model_runner(
@@ -176,7 +179,10 @@ def __init__(
                 f"Initializing MLX sharded model loader for repo={model_repo}, layers=[{start_layer}, {end_layer})"
             )
             self.shard_loader = MLXModelLoader(
-                model_repo, start_layer=start_layer, end_layer=end_layer
+                model_repo,
+                start_layer=start_layer,
+                end_layer=end_layer,
+                use_hfcache=self.use_hfcache,
             )
             t0 = time.time()
             self.model_shard, self.config, self.tokenizer = self.shard_loader.load()
@@ -1629,5 +1635,6 @@ def create_executor_config(args: argparse.Namespace, gradient_server=None):
         "tp_size": args.tp_size,
         "nccl_port": args.nccl_port,
         "gradient_server": gradient_server,
+        "use_hfcache": args.use_hfcache,
     }
     return config
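
One inconsistency worth flagging: this diff writes the flag into the CUDA-backend kwargs under the key "using_hfcache", while initialize_sgl_model_runner (model_runner.py diff below) reads kwargs.get("use_hfcache", False). Unless the key is remapped somewhere outside these hunks, the SGLang path would always see False. A two-line reproduction of the mismatch:

# Sketch of the key mismatch between the executor's kwargs dict and the
# model runner's lookup, as the two hunks read.
kwargs = {"tp_rank": 0, "tp_size": 1, "using_hfcache": True}  # executor side

use_hfcache = kwargs.get("use_hfcache", False)  # model-runner side
print(use_hfcache)  # False: the value stored under "using_hfcache" is never seen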

src/parallax/server/http_server.py
Lines changed: 11 additions & 2 deletions

@@ -98,6 +98,7 @@ def __init__(
         executor_input_ipc_name,
         executor_output_ipc_name,
         model_path_str,
+        use_hfcache: bool = False,
     ):
         self.asyncio_tasks = set()
         # Init inter-process communication
@@ -114,9 +115,10 @@ def __init__(
         if Path(model_path_str).exists():
             model_path = Path(model_path_str)
         else:
-            model_path = download_metadata_only(model_path_str)
+            model_path = download_metadata_only(model_path_str, local_files_only=use_hfcache)
         config = load_config(model_path)
         self.model_path_str = model_path_str
+        self.use_hfcache = use_hfcache
         self.tokenizer = load_tokenizer(model_path, eos_token_ids=config.get("eos_token_id", None))
         self.detokenizer_class, self.tokenmap = load_detokenizer(model_path, self.tokenizer)

@@ -338,13 +340,18 @@ def create_error_response(


 async def init_app_states(
-    state: State, executor_input_ipc: str, executor_output_ipc: str, model_path: str
+    state: State,
+    executor_input_ipc: str,
+    executor_output_ipc: str,
+    model_path: str,
+    use_hfcache: bool = False,
 ):
     """Init FastAPI app states, including http handler, etc."""
     state.http_handler = HTTPHandler(
         executor_input_ipc,
         executor_output_ipc,
         model_path,
+        use_hfcache,
     )


@@ -433,6 +440,7 @@ def __init__(self, args):
         self.executor_input_ipc_name = args.executor_input_ipc
         self.executor_output_ipc_name = args.executor_output_ipc
         self.model_path = args.model_path
+        self.use_hfcache = args.use_hfcache

     async def run_uvicorn(self):
         """
@@ -467,6 +475,7 @@ def run(self):
                 self.executor_input_ipc_name,
                 self.executor_output_ipc_name,
                 self.model_path,
+                self.use_hfcache,
             )
         )
         asyncio.run(self.run_tasks())
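
init_app_states forwards the flag to HTTPHandler positionally as the fourth argument. That works because use_hfcache sits directly after model_path_str in the constructor, but it is fragile as the signature grows. A runnable sketch (stand-in class, not the real HTTPHandler) contrasting the positional call in the diff with the sturdier keyword form:

class HTTPHandlerSketch:
    """Stand-in with the same parameter order as the diff's HTTPHandler."""

    def __init__(self, in_ipc, out_ipc, model_path_str, use_hfcache: bool = False):
        self.use_hfcache = use_hfcache

# Positional (as in the diff) and keyword forms are equivalent today, but the
# keyword form survives a future parameter being inserted before use_hfcache.
h1 = HTTPHandlerSketch("in.ipc", "out.ipc", "some-org/some-model", True)
h2 = HTTPHandlerSketch("in.ipc", "out.ipc", "some-org/some-model", use_hfcache=True)
assert h1.use_hfcache and h2.use_hfcache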

src/parallax/server/server_args.py
Lines changed: 7 additions & 0 deletions

The node-side CLI gains the same flag as the backend scheduler CLI above; the argparse sketch after that diff applies here unchanged.

@@ -200,6 +200,13 @@ def parse_args() -> argparse.Namespace:
         help="GPU backend to use",
     )

+    parser.add_argument(
+        "--use-hfcache",
+        action="store_true",
+        default=False,
+        help="Use local Hugging Face cache only (no network download)",
+    )
+
     args = parser.parse_args()

     # Validate arguments

src/parallax/server/shard_loader.py
Lines changed: 4 additions & 0 deletions

@@ -36,6 +36,7 @@ def __init__(
         *,
         start_layer: Optional[int] = None,
         end_layer: Optional[int] = None,
+        use_hfcache: bool = False,
     ):
         """
         Initializes the model loader.
@@ -47,10 +48,12 @@ def __init__(
                 Defaults to the beginning of the model.
             end_layer (Optional[int]): The ending layer index for the shard (exclusive).
                 Defaults to the end of the model.
+            use_hfcache (bool): If True, use local Hugging Face cache only (no network download).
         """
         self.model_path_str = model_path_or_hf_repo
         self.start_layer = start_layer
         self.end_layer = end_layer
+        self.use_hfcache = use_hfcache
         self.register_block_class()

     def register_block_class(self):
@@ -113,6 +116,7 @@ def load(
                 self.model_path_str,
                 start_layer=self.start_layer,
                 end_layer=self.end_layer,
+                local_files_only=self.use_hfcache,
             )
         else:
             model_path = get_model_path(self.model_path_str)[0]
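
Putting the loader changes together, usage would look roughly like the sketch below. The import path is inferred from the file layout, the repo id and layer range are placeholders, and with use_hfcache=True the shard's files must already be in the local cache:

from parallax.server.shard_loader import MLXModelLoader

# Placeholder repo id and layer range.
loader = MLXModelLoader(
    "some-org/some-mlx-model",
    start_layer=0,
    end_layer=16,
    use_hfcache=True,  # forwarded as local_files_only to the selective download
)
model_shard, config, tokenizer = loader.load()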

src/parallax/sglang/model_runner.py
Lines changed: 6 additions & 7 deletions

@@ -241,26 +241,25 @@ def initialize_sgl_model_runner(
     """
     apply_parallax_sglang_monkey_patch()

+    # Extract TP-related parameters from kwargs or use defaults
+    tp_rank = kwargs.get("tp_rank", 0)
+    tp_size = kwargs.get("tp_size", 1)
+    use_hfcache = kwargs.get("use_hfcache", False)
+    nccl_port = kwargs.get("nccl_port", None)
     # Use selective download for GPU models to save bandwidth and disk space
     from parallax.utils.selective_download import get_model_path_with_selective_download

     logger.info(
         f"Downloading model with selective weight files for layers [{start_layer}, {end_layer})"
     )
     model_path = get_model_path_with_selective_download(
-        model_repo,
-        start_layer=start_layer,
-        end_layer=end_layer,
+        model_repo, start_layer=start_layer, end_layer=end_layer, use_hfcache=use_hfcache
     )

     config = load_config(model_path)
     tokenizer = load_tokenizer(model_path, eos_token_ids=config.get("eos_token_id", None))
     dtype = config.get("torch_dtype", "bfloat16")

-    # Extract TP-related parameters from kwargs or use defaults
-    tp_rank = kwargs.get("tp_rank", 0)
-    tp_size = kwargs.get("tp_size", 1)
-    nccl_port = kwargs.get("nccl_port", None)
     if nccl_port is None:
         nccl_port = random.randint(4000, 5000)
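
A second naming wrinkle: the rewritten call passes use_hfcache=use_hfcache, but get_model_path_with_selective_download (next diff) declares the parameter as local_files_only. Unless a **kwargs catch-all exists outside these hunks, Python would reject the call. A minimal reproduction against a stand-in with the signature from the next diff:

def get_model_path_with_selective_download(
    model_path_or_repo, start_layer=None, end_layer=None, local_files_only=False
):
    """Stand-in with the signature shown in the selective_download.py diff."""
    return model_path_or_repo

try:
    get_model_path_with_selective_download(
        "some-org/some-model", start_layer=0, end_layer=8, use_hfcache=True
    )
except TypeError as e:
    print(e)  # ... got an unexpected keyword argument 'use_hfcache'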

src/parallax/utils/selective_download.py
Lines changed: 9 additions & 0 deletions

@@ -24,6 +24,7 @@ def download_metadata_only(
     repo_id: str,
     cache_dir: Optional[str] = None,
     force_download: bool = False,
+    local_files_only: bool = False,
 ) -> Path:
     # If a local path is provided, return it directly without contacting HF Hub
     local_path = Path(repo_id)
@@ -35,6 +36,7 @@ def download_metadata_only(
         cache_dir=cache_dir,
         ignore_patterns=EXCLUDE_WEIGHT_PATTERNS,
         force_download=force_download,
+        local_files_only=local_files_only,
     )
     return Path(path)

@@ -45,6 +47,7 @@ def selective_model_download(
     end_layer: Optional[int] = None,
     cache_dir: Optional[str] = None,
     force_download: bool = False,
+    local_files_only: bool = False,
 ) -> Path:
     # Handle local model directory
     local_path = Path(repo_id)
@@ -58,6 +61,7 @@ def selective_model_download(
             repo_id=repo_id,
             cache_dir=cache_dir,
             force_download=force_download,
+            local_files_only=local_files_only,
         )
         logger.debug(f"Downloaded model metadata to {model_path}")
         is_remote = True
@@ -78,6 +82,7 @@ def selective_model_download(
                 repo_id=repo_id,
                 cache_dir=cache_dir,
                 force_download=force_download,
+                local_files_only=local_files_only,
             )
         else:
             # Step 3: Download only the needed weight files
@@ -90,6 +95,7 @@ def selective_model_download(
                     filename=weight_file,
                     cache_dir=cache_dir,
                     force_download=force_download,
+                    local_files_only=local_files_only,
                 )

             logger.debug(f"Downloaded weight files for layers [{start_layer}, {end_layer})")
@@ -104,6 +110,7 @@ def selective_model_download(
             repo_id=repo_id,
             cache_dir=cache_dir,
             force_download=force_download,
+            local_files_only=local_files_only,
         )
     else:
         logger.debug("No layer range specified and using local path; nothing to download")
@@ -115,9 +122,11 @@ def get_model_path_with_selective_download(
     model_path_or_repo: str,
     start_layer: Optional[int] = None,
     end_layer: Optional[int] = None,
+    local_files_only: bool = False,
 ) -> Path:
     return selective_model_download(
         repo_id=model_path_or_repo,
         start_layer=start_layer,
         end_layer=end_layer,
+        local_files_only=local_files_only,
     )
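
End to end, --use-hfcache presumes the cache was populated beforehand. One way to pre-warm it for a fully offline run, using only documented huggingface_hub calls (the repo id is a placeholder):

from huggingface_hub import snapshot_download

# One-time, online: populate the local Hugging Face cache.
snapshot_download(repo_id="some-org/some-model")  # placeholder repo id

# Afterwards, any hf_hub_download/snapshot_download call made with
# local_files_only=True (what --use-hfcache switches on) resolves from
# the local cache without any network access.
path = snapshot_download(repo_id="some-org/some-model", local_files_only=True)
print(path)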
