This repository was archived by the owner on Oct 9, 2024. It is now read-only.

Commit a17b7d3

drop duplicate html files (#56)

* drop duplicate html and js
* fix model load path
* update dockerfile

1 parent 4fe1cb9, commit a17b7d3

File tree: 19 files changed (+132 / -269 lines)

19 files changed

+132
-269
lines changed

Dockerfile
Lines changed: 3 additions & 4 deletions

@@ -26,17 +26,17 @@ RUN conda install -c anaconda cmake -y

# necessary stuff
RUN pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 \
-    transformers==4.25.1 \
+    transformers==4.26.1 \
    deepspeed==0.7.6 \
-    accelerate==0.15.0 \
+    accelerate==0.16.0 \
    gunicorn==20.1.0 \
    flask \
    flask_api \
    fastapi==0.89.1 \
    uvicorn==0.19.0 \
    jinja2==3.1.2 \
    pydantic==1.10.2 \
-    huggingface_hub==0.10.1 \
+    huggingface_hub==0.12.1 \
    grpcio-tools==1.50.0 \
    --no-cache-dir

@@ -64,5 +64,4 @@ CMD git clone https://github.com/huggingface/transformers-bloom-inference.git &&
    cd transformers-bloom-inference && \
    # install grpc and compile protos
    make gen-proto && \
-    make ui model_name=bigscience/bloom-560m && \
    make bloom-560m
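
For reference, the pins touched by this commit can be sanity-checked inside the built image. A minimal sketch (not part of the repository; the expected versions are copied from the updated RUN pip install line above):

```python
# check_pins.py -- illustration only, not part of the repository
from importlib.metadata import PackageNotFoundError, version

# expected versions copied from the updated RUN pip install line
EXPECTED = {
    "transformers": "4.26.1",
    "accelerate": "0.16.0",
    "huggingface_hub": "0.12.1",
    "deepspeed": "0.7.6",
}

for package, expected in EXPECTED.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (expected {expected})")
        continue
    marker = "ok" if installed == expected else f"mismatch (expected {expected})"
    print(f"{package}: {installed} -> {marker}")
```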

Makefile
Lines changed: 24 additions & 4 deletions

@@ -10,8 +10,13 @@ gen-proto:

    rm -rf inference_server/model_handler/grpc_utils/pb/*.py-e

+ui:
+    python -m ui --ui_host 127.0.0.1 --ui_port 5001 --generation_backend_host 127.0.0.1 --generation_backend_port 5000 &
+
# ------------------------- DS inference -------------------------
bloom-176b:
+    make ui
+
    TOKENIZERS_PARALLELISM=false \
    MODEL_NAME=bigscience/bloom \
    MODEL_CLASS=AutoModelForCausalLM \

@@ -24,6 +29,8 @@ bloom-176b:

# loads faster than the above one
microsoft-bloom-176b:
+    make ui
+
    TOKENIZERS_PARALLELISM=false \
    MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \
    MODEL_CLASS=AutoModelForCausalLM \

@@ -35,6 +42,8 @@ microsoft-bloom-176b:
    gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

bloomz-176b:
+    make ui
+
    TOKENIZERS_PARALLELISM=false \
    MODEL_NAME=bigscience/bloomz \
    MODEL_CLASS=AutoModelForCausalLM \

@@ -46,6 +55,8 @@ bloomz-176b:
    gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

bloom-176b-int8:
+    make ui
+
    TOKENIZERS_PARALLELISM=false \
    MODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \
    MODEL_CLASS=AutoModelForCausalLM \

@@ -58,6 +69,8 @@ bloom-176b-int8:

# ------------------------- HF accelerate -------------------------
bloom-560m:
+    make ui
+
    TOKENIZERS_PARALLELISM=false \
    MODEL_NAME=bigscience/bloom-560m \
    MODEL_CLASS=AutoModelForCausalLM \

@@ -69,6 +82,8 @@ bloom-560m:
    gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

flan-t5-xxl:
+    make ui
+
    TOKENIZERS_PARALLELISM=false \
    MODEL_NAME=google/flan-t5-xxl \
    MODEL_CLASS=AutoModelForSeq2SeqLM \

@@ -80,6 +95,8 @@ flan-t5-xxl:
    gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

ul2:
+    make ui
+
    TOKENIZERS_PARALLELISM=false \
    MODEL_NAME=google/ul2 \
    MODEL_CLASS=AutoModelForSeq2SeqLM \

@@ -91,6 +108,8 @@ ul2:
    gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

codegen-mono:
+    make ui
+
    TOKENIZERS_PARALLELISM=false \
    MODEL_NAME=Salesforce/codegen-16B-mono \
    MODEL_CLASS=AutoModelForCausalLM \

@@ -103,22 +122,23 @@ codegen-mono:

# ------------------------- HF CPU -------------------------
bloom-560m-cpu:
+    make ui
+
    MODEL_NAME=bigscience/bloom-560m \
    MODEL_CLASS=AutoModelForCausalLM \
    DEPLOYMENT_FRAMEWORK=hf_cpu \
-    DTYPE=bf16 \
+    DTYPE=fp32 \
    MAX_INPUT_LENGTH=2048 \
    MAX_BATCH_SIZE=32 \
    gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

flan-t5-base-cpu:
+    make ui
+
    MODEL_NAME=google/flan-t5-base \
    MODEL_CLASS=AutoModelForSeq2SeqLM \
    DEPLOYMENT_FRAMEWORK=hf_cpu \
    DTYPE=bf16 \
    MAX_INPUT_LENGTH=2048 \
    MAX_BATCH_SIZE=32 \
    gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
-
-ui:
-    python -m ui --model_name $(model_name) &
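
With this change every model target first runs make ui (the Flask UI on 127.0.0.1:5001, pointed at the generation backend on 127.0.0.1:5000) and then starts the gunicorn backend itself, so a single make bloom-560m brings up both processes. A stdlib-only sketch for checking that both ports are listening after launch (the ports are the ones hard-coded above; the script is not part of the repository):

```python
# wait_for_servers.py -- illustration only, not part of the repository
import socket
import time

# ports hard-coded in the Makefile: 5000 = generation backend (gunicorn), 5001 = UI
PORTS = {"generation backend": 5000, "ui": 5001}


def is_listening(port: int, host: str = "127.0.0.1", timeout: float = 1.0) -> bool:
    """Return True if a TCP connection to host:port can be established."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False


for name, port in PORTS.items():
    # poll for up to ~120 s; large models take a while to load
    for _ in range(120):
        if is_listening(port):
            print(f"{name} is up on 127.0.0.1:{port}")
            break
        time.sleep(1)
    else:
        print(f"{name} did not come up on 127.0.0.1:{port}")
```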

inference_server/benchmark.py
Lines changed: 6 additions & 6 deletions

@@ -14,7 +14,7 @@
    get_dummy_batch,
    get_world_size,
    parse_args,
-    print_rank_n,
+    print_rank_0,
    run_and_log_time,
)

@@ -49,18 +49,18 @@ def benchmark_end_to_end(args: argparse.Namespace) -> None:

    request = create_generate_request(get_dummy_batch(args.batch_size), args.generate_kwargs)

-    print_rank_n(f"generate_kwargs = {args.generate_kwargs}")
-    print_rank_n(f"batch_size = {args.batch_size}")
+    print_rank_0(f"generate_kwargs = {args.generate_kwargs}")
+    print_rank_0(f"batch_size = {args.batch_size}")

    # warmup is a must if measuring speed as it's when all the optimizations are performed
    # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs
    response = model.generate(request=request)

    for i, (o, _) in zip(request.text, zip(response.text, response.num_generated_tokens)):
-        print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n")
+        print_rank_0(f"{'-' * 60}\nin = {i}\nout = {o}\n")

    if args.benchmark_cycles > 0:
-        print_rank_n("*** Running benchmark")
+        print_rank_0("*** Running benchmark")

        torch.cuda.empty_cache()
        gc.collect()

@@ -78,7 +78,7 @@ def benchmark_end_to_end(args: argparse.Namespace) -> None:
    if args.deployment_framework == DS_ZERO:
        total_new_tokens_generated *= get_world_size()

-    print_rank_n(
+    print_rank_0(
        get_benchmark_results(
            benchmark_time, initialization_time, total_new_tokens_generated, args.batch_size, args.benchmark_cycles
        )
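
This file, cli.py, deployment.py and generation_server.py all rename print_rank_n to print_rank_0. The helper lives in inference_server/utils and is not shown in this diff; a rank-0-only print wrapper of this kind typically looks roughly like the sketch below (an assumption about the implementation, not the repository's actual code):

```python
# sketch of a rank-0-only print helper; the real implementation in
# inference_server/utils is not part of this diff
import torch.distributed as dist


def print_rank_0(*args, **kwargs) -> None:
    """Print only on the main process so multi-GPU runs do not repeat every log line."""
    if dist.is_initialized() and dist.get_rank() != 0:
        return
    print(*args, **kwargs)
```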

inference_server/cli.py
Lines changed: 3 additions & 3 deletions

@@ -3,7 +3,7 @@
import sys

from .model_handler import ModelDeployment
-from .utils import get_argument_parser, parse_args, print_rank_n
+from .utils import get_argument_parser, parse_args, print_rank_0


def get_args() -> argparse.Namespace:

@@ -35,8 +35,8 @@ def main() -> None:

        response = model.generate(text=[input_text], generate_kwargs=generate_kwargs)

-        print_rank_n("Output text:", response.text[0])
-        print_rank_n("Generated tokens:", response.num_generated_tokens[0])
+        print_rank_0("Output text:", response.text[0])
+        print_rank_0("Generated tokens:", response.num_generated_tokens[0])


if __name__ == "__main__":

inference_server/model_handler/deployment.py
Lines changed: 4 additions & 10 deletions

@@ -21,7 +21,7 @@
    get_cuda_visible_devices,
    get_str_dtype,
    get_world_size,
-    print_rank_n,
+    print_rank_0,
)
from .grpc_utils.pb import generation_pb2, generation_pb2_grpc

@@ -53,7 +53,7 @@ def __init__(self, args: argparse.Namespace, grpc_allowed: bool = False):
        else:
            self.model = get_model_class(args.deployment_framework)(args)

-        print_rank_n("model loaded")
+        print_rank_0("model loaded")

    def should_use_grpc(self, deployment_framework: str, grpc_allowed: bool) -> bool:
        if grpc_allowed and get_world_size() > 1:

@@ -94,8 +94,8 @@ def _wait_until_server_is_live(self):
            if not process_alive:
                raise RuntimeError("server crashed for some reason, unable to proceed")
            time.sleep(4)
-            print_rank_n("waiting for server to start...")
-        print_rank_n(f"server has started on {self.ports[0]}")
+            print_rank_0("waiting for server to start...")
+        print_rank_0(f"server has started on {self.ports[0]}")

    def dict_to_proto(self, generate_kwargs: dict) -> dict:
        result = {}

@@ -229,9 +229,3 @@ def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:
        response = self.model.tokenize(request)

        return response
-
-    def _request_response(self):
-        raise NotImplementedError("This method should not be implemented")
-
-    def query(self):
-        raise NotImplementedError("This method should not be implemented")

inference_server/model_handler/grpc_utils/generation_server.py
Lines changed: 15 additions & 7 deletions

@@ -7,7 +7,7 @@

# from ...constants import GRPC_MAX_MSG_SIZE
from ...models import Model
-from ...utils import ForwardRequest, create_generate_request, print_rank_n
+from ...utils import ForwardRequest, TokenizeRequest, create_generate_request, print_rank_0
from .pb import generation_pb2, generation_pb2_grpc


@@ -33,10 +33,14 @@ def Generate(self, request, context):

        if isinstance(response, Exception):
            # if exception occurs, we don't this subprocess to crash
-            response = generation_pb2.GenerationResponseProto(error=str(response))
+            response = generation_pb2.GenerationResponseProto(
+                error=str(response), is_encoder_decoder=response.is_encoder_decoder
+            )
        else:
            response = generation_pb2.GenerationResponseProto(
-                texts=response.text, num_generated_tokens=response.num_generated_tokens
+                texts=response.text,
+                num_generated_tokens=response.num_generated_tokens,
+                is_encoder_decoder=response.is_encoder_decoder,
            )

        return response

@@ -55,9 +59,13 @@ def Forward(self, request, context):

        if isinstance(response, Exception):
            # if exception occurs, we don't this subprocess to crash
-            response = generation_pb2.ForwardResponseProto(error=str(response))
+            response = generation_pb2.ForwardResponseProto(
+                error=str(response), is_encoder_decoder=response.is_encoder_decoder
+            )
        else:
-            response = generation_pb2.ForwardResponseProto(nll=response.nll)
+            response = generation_pb2.ForwardResponseProto(
+                nll=response.nll, is_encoder_decoder=response.is_encoder_decoder
+            )

        return response

@@ -72,7 +80,7 @@ def serve(inference_pipeline, port):
    )
    generation_pb2_grpc.add_GenerationServiceServicer_to_server(GenerationServer(inference_pipeline), server)
    server.add_insecure_port(f"[::]:{port}")
-    print_rank_n("About to start server")
+    print_rank_0("About to start server")
    server.start()
-    print_rank_n("Started")
+    print_rank_0("Started")
    server.wait_for_termination()

inference_server/model_handler/grpc_utils/pb/generation_pb2.py
Lines changed: 8 additions & 8 deletions

(Generated protobuf stubs; the diff is not rendered by default.)

inference_server/model_handler/grpc_utils/proto/generation.proto
Lines changed: 2 additions & 0 deletions

@@ -24,6 +24,7 @@ message GenerationResponseProto {
    repeated string texts = 1;
    repeated int32 num_generated_tokens = 2;
    string error = 3;
+    bool is_encoder_decoder = 4;
}

message ForwardRequestProto {

@@ -34,4 +35,5 @@ message ForwardRequestProto {
message ForwardResponseProto {
    float nll = 1;
    string error = 2;
+    bool is_encoder_decoder = 3;
}
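
After make gen-proto regenerates the pb stubs, the new field is available like any other proto3 scalar; clients that do not set it simply read the default False. A short illustration (it assumes the regenerated generation_pb2 module is importable from the path shown in the file tree; the snippet itself is not part of the repository):

```python
# illustration only; assumes the stubs have been regenerated via `make gen-proto`
from inference_server.model_handler.grpc_utils.pb import generation_pb2

response = generation_pb2.GenerationResponseProto(
    texts=["hello world"],
    num_generated_tokens=[2],
    is_encoder_decoder=False,  # field 4 added by this commit
)
print(response.is_encoder_decoder)  # proto3 bool defaults to False when unset
```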

inference_server/models/ds_inference.py
Lines changed: 6 additions & 5 deletions

@@ -74,14 +74,15 @@ def __init__(self, model_path: str):
        self.tmp_file = os.path.join(self.tmp_directory, "checkpoints.json")
        self.model_path = model_path

-    def write_checkpoints_json(self, model_path: str) -> None:
+    def write_checkpoints_json(self) -> None:
+        print(self.model_path)
        with io.open(self.tmp_file, "w", encoding="utf-8") as f:
-            data = {"type": "BLOOM", "checkpoints": glob.glob(f"{model_path}/*.bin"), "version": 1.0}
+            data = {"type": "BLOOM", "checkpoints": glob.glob(f"{self.model_path}/*.bin"), "version": 1.0}
            json.dump(data, f)

    def __enter__(self):
-        run_rank_n(partial(os.makedirs, name=self.tmp_directory, exist_ok=True))
-        run_rank_n(partial(self.write_checkpoints_json, model_path=self.model_path), barrier=True)
+        run_rank_n(os.makedirs, barrier=True)(self.tmp_directory, exist_ok=True)
+        run_rank_n(self.write_checkpoints_json, barrier=True)()
        return self.tmp_file

    def __exit__(self, type, value, traceback):

@@ -98,7 +99,7 @@ def get_model_path(model_name: str):
        if config_path is None:
            # treat the model name as an explicit model path
            return model_name
-        elif os.path.isfile(os.path.join(model_name, config_file)):
+        else:
            return os.path.dirname(config_path)
    except:
        # treat the model name as an explicit model path
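
The __enter__ change implies that run_rank_n no longer takes a functools.partial but is called as run_rank_n(func, barrier=True)(*args), i.e. it returns a wrapped callable. Its definition lives in inference_server/utils and is not part of this diff; a sketch of a wrapper with that call shape, assuming torch.distributed is used for the rank check and the barrier:

```python
# sketch only -- the repository's actual run_rank_n is not shown in this diff
from typing import Any, Callable

import torch.distributed as dist


def run_rank_n(func: Callable, barrier: bool = False, rank: int = 0) -> Callable:
    """Wrap func so it runs on a single rank, optionally synchronizing all ranks afterwards."""

    def wrapper(*args: Any, **kwargs: Any) -> Any:
        if not dist.is_initialized():
            return func(*args, **kwargs)
        result = func(*args, **kwargs) if dist.get_rank() == rank else None
        if barrier:
            dist.barrier()
        return result

    return wrapper
```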

inference_server/models/model.py
Lines changed: 10 additions & 4 deletions

@@ -92,7 +92,11 @@ def generate(self, request: GenerateRequest) -> Union[GenerateResponse, Exception]:
            else:
                generated_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True)

-            return GenerateResponse(text=generated_text, num_generated_tokens=num_generated_tokens)
+            return GenerateResponse(
+                text=generated_text,
+                num_generated_tokens=num_generated_tokens,
+                is_encoder_decoder=self.is_encoder_decoder,
+            )
        except Exception as exception:
            return exception

@@ -140,13 +144,15 @@ def pad(arrays: list, padding: int, max_length: int = None):

            loss = self.model(**input_tokens).loss

-            return ForwardResponse(nll=loss.item())
+            return ForwardResponse(nll=loss.item(), is_encoder_decoder=self.is_encoder_decoder)
        except Exception as exception:
            return exception

    def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:
-        response = self.tokenizer(request.text, padding=request.padding)
-        return TokenizeResponse(token_ids=response.input_ids, attention_mask=response.attention_mask)
+        return TokenizeResponse(
+            token_ids=self.tokenizer(request.text).input_ids,
+            is_encoder_decoder=self.is_encoder_decoder,
+        )


def check_max_input_length(input_token_length: int, max_input_length: int) -> None:
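
The constructor calls above mean the response objects now carry is_encoder_decoder, and TokenizeResponse no longer returns an attention_mask. Their definitions live in inference_server/utils and are not shown in this diff; after this commit they would look roughly like the sketch below (field names taken from the calls above, types are assumptions):

```python
# sketch only -- field names mirror the constructor calls in this diff;
# the real definitions in inference_server/utils are not shown here
from dataclasses import dataclass
from typing import List


@dataclass
class GenerateResponse:
    text: List[str]
    num_generated_tokens: List[int]
    is_encoder_decoder: bool


@dataclass
class ForwardResponse:
    nll: float
    is_encoder_decoder: bool


@dataclass
class TokenizeResponse:
    token_ids: List[List[int]]
    is_encoder_decoder: bool
```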
