-
Notifications
You must be signed in to change notification settings - Fork 742
[Cherry-Pick][Optimization] Enable distributed communication environment variables by default (#7746) #7784
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,7 +16,6 @@ | |
| import queue | ||
| import shutil | ||
| import signal | ||
| import socket | ||
| import subprocess | ||
| import sys | ||
| import time | ||
|
|
@@ -30,6 +29,7 @@ | |
| sys.path.insert(0, project_root) | ||
|
|
||
| from ci_use.EB_Lite_with_adapter.zmq_client import LLMControlClient, LLMReqClient | ||
| from e2e.utils.serving_utils import clean_ports, is_port_open | ||
|
|
||
| env = os.environ.copy() | ||
|
|
||
|
|
@@ -79,88 +79,6 @@ def zmq_control_client(): | |
| return client | ||
|
|
||
|
|
||
def is_port_open(host: str, port: int, timeout=1.0):
    """
    Check whether a TCP connection to ``host:port`` can be established.

    Args:
        host: Host name or IP address to probe.
        port: TCP port number to probe.
        timeout: Seconds to wait for the connection attempt.

    Returns:
        True if the connection succeeds, False otherwise.
    """
    try:
        # The context manager closes the probe socket as soon as we know the answer.
        with socket.create_connection((host, port), timeout):
            return True
    except (OSError, OverflowError, ValueError):
        # OSError covers refused connections, timeouts and name-resolution
        # failures; OverflowError/ValueError cover out-of-range port numbers.
        # Anything else is a programming error and is allowed to surface.
        return False
|
|
||
|
|
||
def _pids_from_lsof(port):
    """Return the PIDs printed by ``lsof -i:<port> -t``; empty if lsof fails or is missing."""
    try:
        raw = subprocess.check_output(["lsof", f"-i:{port}", "-t"])
    except (subprocess.CalledProcessError, OSError):
        # lsof exits non-zero when nothing matches; OSError means it is not installed.
        return []
    return [int(tok) for tok in raw.decode(errors="replace").split() if tok.isdigit()]


def _pids_from_netstat(port):
    """Return PIDs of ``netstat -tulpn`` rows whose local address ends in exactly ``:<port>``."""
    try:
        raw = subprocess.check_output(["netstat", "-tulpn"], stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError):
        return []

    wanted = str(port)
    pids = []
    for line in raw.decode(errors="replace").splitlines():
        cols = line.split()
        # Column 4 is the local address. Compare the port part exactly so that
        # cleaning port 80 never matches 8080 (a substring grep would).
        if len(cols) < 6 or cols[3].rpartition(":")[2] != wanted:
            continue
        # The "PID/Program" column position depends on whether a State column
        # is present (UDP rows have none), so search for it instead of indexing.
        for col in cols[5:]:
            pid_text, sep, _ = col.partition("/")
            if sep and pid_text.isdigit():
                pids.append(int(pid_text))
                break
    return pids


def _send_signal(pid, sig):
    """Send ``sig`` to ``pid``; return False if the process is gone or may not be signalled."""
    try:
        os.kill(pid, sig)
    except ProcessLookupError:
        return False  # Process already terminated
    except PermissionError:
        print(f"No permission to signal pid={pid}, skipping")
        return False
    return True


def kill_process_on_port(port: int):
    """
    Kill processes that are using the given port.

    Three methods are applied in turn so that a missing tool does not prevent
    cleanup: ``lsof``, then ``netstat``, then ``fuser -k``. The current process
    and its parent are never signalled.

    Args:
        port: Port number whose owning processes should be killed.
    """
    protected = (os.getpid(), os.getppid())

    # Method 1: Use lsof to find processes
    for pid in _pids_from_lsof(port):
        if pid in protected:
            print(f"Skip killing current process (pid={pid}) on port {port}")
            continue
        # First try SIGTERM for graceful shutdown, then SIGKILL if still running
        if _send_signal(pid, signal.SIGTERM):
            time.sleep(1)
            _send_signal(pid, signal.SIGKILL)
            print(f"Killed process on port {port}, pid={pid}")

    # Method 2: Use netstat as backup
    for pid in _pids_from_netstat(port):
        if pid in protected:
            continue
        if _send_signal(pid, signal.SIGKILL):
            print(f"Killed process (netstat) on port {port}, pid={pid}")

    # Method 3: Use fuser if available
    try:
        subprocess.run(["fuser", "-k", f"{port}/tcp"], timeout=5)
    except (subprocess.TimeoutExpired, OSError):
        pass
|
|
||
|
|
||
def clean_ports(ports=None):
    """
    Kill all processes occupying the given ports.

    Args:
        ports: Iterable of port numbers to clean. Defaults to the module-level
            PORTS_TO_CLEAN, so existing ``clean_ports()`` calls keep working
            while ``clean_ports(PORTS_TO_CLEAN)`` is accepted as well.
    """
    # Materialise caller-supplied iterables because the ports are walked twice.
    ports = PORTS_TO_CLEAN if ports is None else list(ports)

    print(f"Cleaning ports: {ports}")
    for port in ports:
        kill_process_on_port(port)

    # Double check and retry if ports are still in use
    time.sleep(2)
    for port in ports:
        if is_port_open("127.0.0.1", port, timeout=0.1):
            print(f"Port {port} still in use, retrying cleanup...")
            kill_process_on_port(port)
            time.sleep(1)
|
|
||
|
|
||
| @pytest.fixture(scope="session", autouse=True) | ||
| def setup_and_run_server(): | ||
| """ | ||
|
|
@@ -170,8 +88,15 @@ def setup_and_run_server(): | |
| - Waits for server port to open (up to 30 seconds) | ||
| - Tears down server after all tests finish | ||
| """ | ||
| # 清理/dev/shm中的临时文件 | ||
| try: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 建议 建议只清理本次测试已知的文件(例如 `/dev/shm/fd_task_queue_*.sock`):

```python
import glob

for f in glob.glob("/dev/shm/fd_task_queue_*.sock"):
    try:
        os.remove(f)
    except Exception:
        pass
```
||
| subprocess.run("rm -rf /dev/shm/*", shell=True) | ||
| print("Successfully cleaned up /dev/shm.") | ||
| except Exception as e: | ||
| print(f"Failed to cleanup /dev/shm: {e}") | ||
|
|
||
| print("Pre-test port cleanup...") | ||
| clean_ports() | ||
| clean_ports(PORTS_TO_CLEAN) | ||
|
|
||
| base_path = os.getenv("MODEL_PATH") | ||
| if base_path: | ||
|
|
@@ -236,7 +161,7 @@ def setup_and_run_server(): | |
| print("\n===== Post-test server cleanup... =====") | ||
| try: | ||
| os.killpg(process.pid, signal.SIGTERM) | ||
| clean_ports() | ||
| clean_ports(PORTS_TO_CLEAN) | ||
| print(f"API server (pid={process.pid}) terminated") | ||
| except Exception as e: | ||
| print(f"Failed to terminate API server: {e}") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,31 +13,25 @@ | |
| # limitations under the License. | ||
|
|
||
| import os | ||
| import signal | ||
| import socket | ||
| import subprocess | ||
| import sys | ||
| import time | ||
| import traceback | ||
|
|
||
| import pytest | ||
|
|
||
| from fastdeploy import LLM, SamplingParams | ||
|
|
||
| FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) | ||
| FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) | ||
| MAX_WAIT_SECONDS = 60 | ||
|
|
||
| current_dir = os.path.dirname(os.path.abspath(__file__)) | ||
| project_root = os.path.abspath(os.path.join(current_dir, "..", "..")) | ||
| sys.path.insert(0, project_root) | ||
| from e2e.utils.serving_utils import ( | ||
| FD_API_PORT, | ||
| FD_CACHE_QUEUE_PORT, | ||
| FD_ENGINE_QUEUE_PORT, | ||
| clean_ports, | ||
| ) | ||
|
|
||
def is_port_open(host: str, port: int, timeout=1.0):
    """
    Check whether a TCP connection to ``host:port`` can be established.

    Args:
        host: Host name or IP address to probe.
        port: TCP port number to probe.
        timeout: Seconds to wait for the connection attempt.

    Returns:
        True if the connection succeeds, False otherwise.
    """
    try:
        # The context manager closes the probe socket as soon as we know the answer.
        with socket.create_connection((host, port), timeout):
            return True
    except (OSError, OverflowError, ValueError):
        # OSError covers refused connections, timeouts and name-resolution
        # failures; OverflowError/ValueError cover out-of-range port numbers.
        # Anything else is a programming error and is allowed to surface.
        return False
| MAX_WAIT_SECONDS = 60 | ||
|
|
||
|
|
||
| def format_chat_prompt(messages): | ||
|
|
@@ -74,35 +68,23 @@ def llm(model_path): | |
| """ | ||
| Fixture to initialize the LLM model with a given model path | ||
| """ | ||
| try: | ||
| output = subprocess.check_output(f"lsof -i:{FD_ENGINE_QUEUE_PORT} -t", shell=True).decode().strip() | ||
| for pid in output.splitlines(): | ||
| os.kill(int(pid), signal.SIGKILL) | ||
| print(f"Killed process on port {FD_ENGINE_QUEUE_PORT}, pid={pid}") | ||
| except subprocess.CalledProcessError: | ||
| pass | ||
| # Clean ports before starting the test | ||
| clean_ports() | ||
|
|
||
| try: | ||
| start = time.time() | ||
| llm = LLM( | ||
| model=model_path, | ||
| tensor_parallel_size=1, | ||
| port=FD_API_PORT, | ||
| engine_worker_queue_port=FD_ENGINE_QUEUE_PORT, | ||
| cache_queue_port=FD_CACHE_QUEUE_PORT, | ||
| max_model_len=32768, | ||
| quantization="wint8", | ||
| logits_processors=["LogitBiasLogitsProcessor"], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 建议 建议恢复轮询或使用更可靠的健康检查:

```python
wait_start = time.time()
while not is_port_open("127.0.0.1", FD_ENGINE_QUEUE_PORT):
    if time.time() - wait_start > MAX_WAIT_SECONDS:
        pytest.fail(f"Engine did not start within {MAX_WAIT_SECONDS}s")
    time.sleep(1)
```
||
| ) | ||
|
|
||
| # Wait for the port to be open | ||
| wait_start = time.time() | ||
| while not is_port_open("127.0.0.1", FD_ENGINE_QUEUE_PORT): | ||
| if time.time() - wait_start > MAX_WAIT_SECONDS: | ||
| pytest.fail( | ||
| f"Model engine did not start within {MAX_WAIT_SECONDS} seconds on port {FD_ENGINE_QUEUE_PORT}" | ||
| ) | ||
| time.sleep(1) | ||
|
|
||
| time.sleep(2) | ||
| print(f"Model loaded successfully from {model_path} in {time.time() - start:.2f}s.") | ||
| yield llm | ||
| except Exception: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 建议
在 `is_file_socket_available()` 中,对于非 ECONNREFUSED/ENOENT 的 OSError(如 EACCES 权限拒绝、ECONNABORTED 等)直接返回 False,将导致端口被误判为「不可用」,最终 `find_free_ports` 可能无法找到可用端口。建议修复方式:对非预期错误记录日志并返回 True(保守策略,让 TCP 层绑定去兜底),或显式列举应返回 False 的错误码: