|
18 | 18 | import os |
19 | 19 | import re |
20 | 20 | import shutil |
21 | | -import subprocess |
22 | | -from copy import deepcopy |
23 | 21 |
|
24 | 22 | import defs.ci_profiler |
25 | 23 | import pytest |
26 | | -from defs.common import (convert_weights, generate_summary_cmd, |
27 | | - get_cpp_benchmark, get_trt_llm_lib_dir, parse_output, |
| 24 | +from defs.common import (convert_weights, generate_summary_cmd, parse_output, |
28 | 25 | quantize_data, similar, |
29 | 26 | test_llm_torch_multi_lora_support, |
30 | 27 | test_multi_lora_support, venv_check_call, |
@@ -2683,184 +2680,6 @@ def test_llm_llama_v1_multiple_lora_1gpu(data_type, lora_data_type, |
2683 | 2680 | venv_check_call(llm_venv, run_cmd) |
2684 | 2681 |
|
2685 | 2682 |
|
2686 | | -@pytest.mark.skip_less_device_memory(80000) |
2687 | | -@pytest.mark.skip_less_device(2) |
2688 | | -@pytest.mark.parametrize("llama_model_root", ['llama-v2-13b-hf'], indirect=True) |
2689 | | -@pytest.mark.parametrize("llm_lora_model_root", ["chinese-llama-2-lora-13b"], |
2690 | | - ids=["chinese_lora"], |
2691 | | - indirect=True) |
2692 | | -def test_llm_llama_v2_lora_benchmark_2gpu(llama_example_root, llama_model_root, |
2693 | | - llm_venv, llm_root, cmodel_dir, |
2694 | | - engine_dir, llm_lora_model_root): |
2695 | | - "benchmark llama with multi lora on 2gpu" |
2696 | | - print("Build engines...") |
2697 | | - |
2698 | | - num_layers = 40 |
2699 | | - num_lora_mods = 7 |
2700 | | - max_lora_rank = 64 |
2701 | | - max_len = 1024 |
2702 | | - max_batch = 32 |
2703 | | - eos_id = 2 |
2704 | | - num_loras = (8, 16) |
2705 | | - num_requests = 1024 |
2706 | | - |
2707 | | - model_dir = convert_weights(llm_venv=llm_venv, |
2708 | | - example_root=llama_example_root, |
2709 | | - cmodel_dir=cmodel_dir, |
2710 | | - model="llama-lora", |
2711 | | - model_path=llama_model_root, |
2712 | | - gpus=2, |
2713 | | - tp_size=2, |
2714 | | - data_type="float16") |
2715 | | - |
2716 | | - print("Build engines...") |
2717 | | - build_cmd = [ |
2718 | | - "trtllm-build", |
2719 | | - f"--checkpoint_dir={model_dir}", |
2720 | | - f"--output_dir={engine_dir}", |
2721 | | - f"--max_batch_size={max_batch}", |
2722 | | - f"--max_input_len={max_len}", |
2723 | | - f"--max_seq_len={2 * max_len}", |
2724 | | - "--gemm_plugin=float16", |
2725 | | - "--lora_plugin=float16", |
2726 | | - "--use_paged_context_fmha=enable", |
2727 | | - "--lora_target_modules", |
2728 | | - "attn_q", |
2729 | | - "attn_k", |
2730 | | - "attn_v", |
2731 | | - "attn_dense", |
2732 | | - "mlp_h_to_4h", |
2733 | | - "mlp_4h_to_h", |
2734 | | - "mlp_gate", |
2735 | | - f"--max_lora_rank={max_lora_rank}", |
2736 | | - ] |
2737 | | - check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env) |
2738 | | - |
2739 | | - print("Convert LoRA to cpp format") |
2740 | | - convert_cmd = [ |
2741 | | - "python", |
2742 | | - f"{llama_example_root}/../../../hf_lora_convert.py", |
2743 | | - f"-i={llm_lora_model_root}", |
2744 | | - "--storage-type=float16", |
2745 | | - f"-o={llm_venv.get_working_directory()}/lora_cpp", |
2746 | | - ] |
2747 | | - check_call(" ".join(convert_cmd), shell=True, env=llm_venv._new_env) |
2748 | | - |
2749 | | - print("Prepare datasets") |
2750 | | - benchmark_root = f"{llama_example_root}/../../../../benchmarks/cpp" |
2751 | | - lora_eg = f"{llm_venv.get_working_directory()}/lora-eg" |
2752 | | - base_dataset_cmd = [ |
2753 | | - f"mkdir -p {lora_eg}/data", |
2754 | | - "&&", |
2755 | | - "python", |
2756 | | - f"{benchmark_root}/prepare_dataset.py", |
2757 | | - f"--output={lora_eg}/data/token-norm-dist.json", |
2758 | | - f"--tokenizer={llama_model_root}", |
2759 | | - "token-norm-dist", |
2760 | | - f"--num-requests={num_requests}", |
2761 | | - "--input-mean=256", |
2762 | | - "--input-stdev=16", |
2763 | | - "--output-mean=128", |
2764 | | - "--output-stdev 24", |
2765 | | - ] |
2766 | | - check_call(" ".join(base_dataset_cmd), shell=True, env=llm_venv._new_env) |
2767 | | - |
2768 | | - for nloras in num_loras: |
2769 | | - lora_dataset_cmd = [ |
2770 | | - "python", |
2771 | | - f"{benchmark_root}/prepare_dataset.py", |
2772 | | - f"--output={lora_eg}/data/token-norm-dist-lora-{nloras}.json", |
2773 | | - f"--rand-task-id 0 {nloras-1}", |
2774 | | - f"--tokenizer={llama_model_root}", |
2775 | | - "token-norm-dist", |
2776 | | - f"--num-requests={num_requests}", |
2777 | | - "--input-mean=256", |
2778 | | - "--input-stdev=16", |
2779 | | - "--output-mean=128", |
2780 | | - "--output-stdev 24", |
2781 | | - ] |
2782 | | - check_call(" ".join(lora_dataset_cmd), |
2783 | | - shell=True, |
2784 | | - env=llm_venv._new_env) |
2785 | | - |
2786 | | - print("Generate random lora weights for 16 adapters") |
2787 | | - |
2788 | | - lora_weights_cmd = [ |
2789 | | - "python", f"{benchmark_root}/utils/generate_rand_loras.py", |
2790 | | - f"{llm_venv.get_working_directory()}/lora_cpp", f"{lora_eg}/loras", "16" |
2791 | | - ] |
2792 | | - check_call(" ".join(lora_weights_cmd), shell=True, env=llm_venv._new_env) |
2793 | | - |
2794 | | - benchmark_exe = get_cpp_benchmark('gptManagerBenchmark', llm_root) |
2795 | | - envs = deepcopy(os.environ) |
2796 | | - _ = envs.pop("CUDA_VISIBLE_DEVICES", "") |
2797 | | - envs[ |
2798 | | - "LD_LIBRARY_PATH"] = f'{get_trt_llm_lib_dir(llm_venv)}:{os.path.dirname(benchmark_exe)}:{envs.get("LD_LIBRARY_PATH", "")}' |
2799 | | - |
2800 | | - print( |
2801 | | - f'CUDA_VISIBLE_DEVICES: {os.environ.get("CUDA_VISIBLE_DEVICES", None)}') |
2802 | | - |
2803 | | - print("Perform base model benchmarking") |
2804 | | - check_call(f"mkdir -p {lora_eg}/log-base-lora", shell=True, env=envs) |
2805 | | - base_benchmark_cmd = [ |
2806 | | - f"{benchmark_exe}", |
2807 | | - f"--engine_dir={engine_dir}", |
2808 | | - "--type=IFB", |
2809 | | - f"--dataset={lora_eg}/data/token-norm-dist.json", |
2810 | | - "--lora_host_cache_bytes=8589934592", |
2811 | | - f"--lora_num_device_mod_layers={32 * num_layers * num_lora_mods * max_lora_rank}", |
2812 | | - "--kv_cache_free_gpu_mem_fraction=0.70", |
2813 | | - "--log_level=info", |
2814 | | - f"--eos_id={eos_id}", |
2815 | | - ] |
2816 | | - mpi_cmd = [ |
2817 | | - "mpirun", |
2818 | | - "-n", |
2819 | | - "2", |
2820 | | - "--allow-run-as-root", |
2821 | | - "--output-filename", |
2822 | | - f"{lora_eg}/log-base-lora", |
2823 | | - ] |
2824 | | - base_benchmark_cmd = mpi_cmd + base_benchmark_cmd |
2825 | | - print( |
2826 | | - f"Running gptManagerBenchmark using base cmd: {' '.join(base_benchmark_cmd)}" |
2827 | | - ) |
2828 | | - subprocess.check_output(base_benchmark_cmd, env=envs) |
2829 | | - # check_call(" ".join(base_benchmark_cmd), env=envs) |
2830 | | - |
2831 | | - print("Perform lora model benchmarking") |
2832 | | - for nloras in num_loras: |
2833 | | - check_call(f"mkdir -p {lora_eg}/log-lora-{nloras}", |
2834 | | - shell=True, |
2835 | | - env=envs) |
2836 | | - lora_benchmark_cmd = [ |
2837 | | - f"{benchmark_exe}", |
2838 | | - f"--engine_dir={engine_dir}", |
2839 | | - "--type=IFB", |
2840 | | - f"--dataset={lora_eg}/data/token-norm-dist-lora-{nloras}.json", |
2841 | | - "--lora_host_cache_bytes=8589934592", |
2842 | | - f"--lora_num_device_mod_layers={16 * num_layers * num_lora_mods * max_lora_rank}", |
2843 | | - "--kv_cache_free_gpu_mem_fraction=0.70", |
2844 | | - "--log_level=info", |
2845 | | - f"--eos_id={eos_id}", |
2846 | | - f"--lora_dir={lora_eg}/loras", |
2847 | | - ] |
2848 | | - mpi_cmd = [ |
2849 | | - "mpirun", |
2850 | | - "-n", |
2851 | | - "2", |
2852 | | - "--allow-run-as-root", |
2853 | | - "--output-filename", |
2854 | | - f"{lora_eg}/log-lora-{nloras}", |
2855 | | - ] |
2856 | | - lora_benchmark_cmd = mpi_cmd + lora_benchmark_cmd |
2857 | | - print( |
2858 | | - f"Running gptManagerBenchmark using lora cmd: {' '.join(lora_benchmark_cmd)}" |
2859 | | - ) |
2860 | | - subprocess.check_output(lora_benchmark_cmd, env=envs) |
2861 | | - # check_call(lora_benchmark_cmd, env=envs) |
2862 | | - |
2863 | | - |
2864 | 2683 | @pytest.mark.timeout(7200) |
2865 | 2684 | @pytest.mark.skip_less_device_memory(80000) |
2866 | 2685 | @pytest.mark.skip_less_device(4) |
|
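Note on the deleted benchmark: the removed test built a llama-v2-13b TP=2 engine with the LoRA plugin, prepared synthetic token-norm-dist datasets plus random adapters, and ran gptManagerBenchmark under mpirun -n 2 once for the base model and once per adapter count. The least obvious values in the diff are the two --lora_num_device_mod_layers settings. A minimal sketch of that arithmetic, reusing the constants from the removed code (reading the leading factor as "cache room for N adapters" is an assumption; only the multiplication itself comes from the diff):

# Sketch of the LoRA device-cache sizing used by the removed test.
# Interpreting 32/16 as the number of resident adapters is an assumption,
# not documented gptManagerBenchmark behavior.
num_layers = 40      # llama-v2-13b decoder layers
num_lora_mods = 7    # attn_q/k/v/dense, mlp_h_to_4h, mlp_4h_to_h, mlp_gate
max_lora_rank = 64

base_mod_layers = 32 * num_layers * num_lora_mods * max_lora_rank  # 573440, base run
lora_mod_layers = 16 * num_layers * num_lora_mods * max_lora_rank  # 286720, LoRA runs
print(base_mod_layers, lora_mod_layers)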