diff --git a/models/model_upload/image-classifier/nsfw-image-classifier/config.yaml b/models/model_upload/image-classifier/nsfw-image-classifier/config.yaml
index 2833135..363a29e 100644
--- a/models/model_upload/image-classifier/nsfw-image-classifier/config.yaml
+++ b/models/model_upload/image-classifier/nsfw-image-classifier/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "visual-classifier"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/image-detector/detr-resnet-image-detection/config.yaml b/models/model_upload/image-detector/detr-resnet-image-detection/config.yaml
index 4413f9f..671d504 100644
--- a/models/model_upload/image-detector/detr-resnet-image-detection/config.yaml
+++ b/models/model_upload/image-detector/detr-resnet-image-detection/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "visual-detector"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/llms/llama-3-8b-instruct/config.yaml b/models/model_upload/llms/llama-3-8b-instruct/config.yaml
index a7c3c65..bfb3dc9 100644
--- a/models/model_upload/llms/llama-3-8b-instruct/config.yaml
+++ b/models/model_upload/llms/llama-3-8b-instruct/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "text-to-text"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/llms/llama-3_2-1b-instruct/1/model.py b/models/model_upload/llms/llama-3_2-1b-instruct/1/model.py
index 6244860..db46132 100644
--- a/models/model_upload/llms/llama-3_2-1b-instruct/1/model.py
+++ b/models/model_upload/llms/llama-3_2-1b-instruct/1/model.py
@@ -126,11 +126,11 @@ def load_model(self):
     # if checkpoints section is in config.yaml file then checkpoints will be downloaded at this path during model upload time.
     checkpoints = os.path.join(os.path.dirname(__file__), "checkpoints")
     self.tokenizer = AutoTokenizer.from_pretrained(checkpoints,)
-    self.tokenizer.pad_token = self.tokenizer.eos_token # Set pad token to eos token
+    self.tokenizer.pad_token = self.tokenizer.eos_token  # Set pad token to eos token
     self.model = AutoModelForCausalLM.from_pretrained(
         checkpoints,
         low_cpu_mem_usage=True,
-        device_map="auto",
+        device_map=self.device,
         torch_dtype=torch.bfloat16,
     )
     logger.info("Done loading!")
@@ -161,7 +161,8 @@ def predict(self,
     for text in outputs_text:
       outputs.append(create_output(text=text, code=status_code_pb2.SUCCESS))

-    return service_pb2.MultiOutputResponse(outputs=outputs, status=status_pb2.Status(code=status_code_pb2.SUCCESS))
+    return service_pb2.MultiOutputResponse(
+        outputs=outputs, status=status_pb2.Status(code=status_code_pb2.SUCCESS))

   def generate(self, request: service_pb2.PostModelOutputsRequest
               ) -> Iterator[service_pb2.MultiOutputResponse]:
@@ -208,7 +209,8 @@ def generate(self, request: service_pb2.PostModelOutputsRequest
           outputs[idx].data.text.raw = text  # Append new text to each output
           outputs[idx].status.code = status_code_pb2.SUCCESS
         # Yield the current outputs
-        yield service_pb2.MultiOutputResponse(outputs=outputs, status=status_pb2.Status(code=status_code_pb2.SUCCESS))
+        yield service_pb2.MultiOutputResponse(
+            outputs=outputs, status=status_pb2.Status(code=status_code_pb2.SUCCESS))

     finally:
       thread.join()
diff --git a/models/model_upload/llms/llama-3_2-1b-instruct/config.yaml b/models/model_upload/llms/llama-3_2-1b-instruct/config.yaml
index 34d43ac..ab3e3ec 100644
--- a/models/model_upload/llms/llama-3_2-1b-instruct/config.yaml
+++ b/models/model_upload/llms/llama-3_2-1b-instruct/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "text-to-text"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/llms/lmdeploy-llama-3_2-1b-instruct/config.yaml b/models/model_upload/llms/lmdeploy-llama-3_2-1b-instruct/config.yaml
index abc33ea..40b691c 100644
--- a/models/model_upload/llms/lmdeploy-llama-3_2-1b-instruct/config.yaml
+++ b/models/model_upload/llms/lmdeploy-llama-3_2-1b-instruct/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "text-to-text"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "2"
diff --git a/models/model_upload/llms/openai-gpt4/config.yaml b/models/model_upload/llms/openai-gpt4/config.yaml
index 52faa80..cd326bc 100644
--- a/models/model_upload/llms/openai-gpt4/config.yaml
+++ b/models/model_upload/llms/openai-gpt4/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "text-to-text"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/llms/vllm-mistral-7b-instruct/config.yaml b/models/model_upload/llms/vllm-mistral-7b-instruct/config.yaml
index 83473f4..944b20d 100644
--- a/models/model_upload/llms/vllm-mistral-7b-instruct/config.yaml
+++ b/models/model_upload/llms/vllm-mistral-7b-instruct/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "text-to-text"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/multimodal_models/vllm-miniCPM-2.6/config.yaml b/models/model_upload/multimodal_models/vllm-miniCPM-2.6/config.yaml
index a08872f..798b9ef 100644
--- a/models/model_upload/multimodal_models/vllm-miniCPM-2.6/config.yaml
+++ b/models/model_upload/multimodal_models/vllm-miniCPM-2.6/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "multimodal-to-text"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/ocr/got-ocr2.0/1/model.py b/models/model_upload/ocr/got-ocr2.0/1/model.py
index 57aa89b..fdf0aa9 100644
--- a/models/model_upload/ocr/got-ocr2.0/1/model.py
+++ b/models/model_upload/ocr/got-ocr2.0/1/model.py
@@ -38,10 +38,9 @@ def load_model(self):
         checkpoint_path,
         trust_remote_code=True,
         use_safetensors=True,
-        device_map="cuda",
+        device_map= self.device,
         low_cpu_mem_usage=True,
         pad_token_id=self.tokenizer.eos_token_id)
-    self.model.eval().cuda()
     logger.info("Done loading Model checkpoints!")

   def predict(self, request: service_pb2.PostModelOutputsRequest
diff --git a/models/model_upload/ocr/got-ocr2.0/config.yaml b/models/model_upload/ocr/got-ocr2.0/config.yaml
index 93bb28f..2c190f9 100644
--- a/models/model_upload/ocr/got-ocr2.0/config.yaml
+++ b/models/model_upload/ocr/got-ocr2.0/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "image-to-text"

 build_info:
-  python_version: "3.10"
+  python_version: "3.11"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/speech-recognition/openai-whisper/config.yaml b/models/model_upload/speech-recognition/openai-whisper/config.yaml
index 519ae3e..9db2ffe 100644
--- a/models/model_upload/speech-recognition/openai-whisper/config.yaml
+++ b/models/model_upload/speech-recognition/openai-whisper/config.yaml
@@ -7,7 +7,7 @@ model:
   model_type_id: "audio-to-text"

 build_info:
-  python_version: "3.10"
+  python_version: "3.12"

 inference_compute_info:
   cpu_limit: "1"
diff --git a/models/model_upload/test-upload/mbart/1/model.py b/models/model_upload/test-upload/mbart/1/model.py
index b1a5239..6e06aff 100644
--- a/models/model_upload/test-upload/mbart/1/model.py
+++ b/models/model_upload/test-upload/mbart/1/model.py
@@ -39,7 +39,7 @@ def load_model(self):
     # if checkpoints section is in config.yaml file then checkpoints will be downloaded at this path during model upload time.
     self.tokenizer = AutoTokenizer.from_pretrained(checkpoints)
     self.model = AutoModelForSeq2SeqLM.from_pretrained(
-        checkpoints, torch_dtype="auto", device_map="auto")
+        checkpoints, torch_dtype="auto", device_map=self.device)

   def predict(self, request: service_pb2.PostModelOutputsRequest
              ) -> Iterator[service_pb2.MultiOutputResponse]:
@@ -51,7 +51,6 @@ def predict(self, request: service_pb2.PostModelOutputsRequest
     raw_texts = []
     for t in texts:
       inputs = self.tokenizer.encode(t, return_tensors="pt").to(self.device)
-      # inputs = self.tokenizer.encode("Translate to English: Je t'aime.", return_tensors="pt").to(self.device)
       outputs = self.model.generate(inputs)
       print(self.tokenizer.decode(outputs[0]))
       raw_texts.append(self.tokenizer.decode(outputs[0]))
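Note: every model.py hunk above replaces a hard-coded device_map value ("auto" or "cuda") with self.device, and got-ocr2.0 also drops the explicit .eval().cuda() call. The diff never shows where self.device is assigned; below is a minimal sketch of how such an attribute could be initialized in load_model(), assuming torch is importable. The class name and structure here are hypothetical, not the repository's actual runner code.

import torch

class ModelRunnerSketch:
  """Hypothetical stand-in for the runner classes touched in this diff."""

  def load_model(self):
    # Use the GPU when one is visible to the container, otherwise fall back to CPU;
    # the from_pretrained(...) calls above can then pass device_map=self.device.
    self.device = "cuda" if torch.cuda.is_available() else "cpu"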
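Note: the remaining hunks only bump build_info.python_version in each model's config.yaml, to "3.11" everywhere except speech-recognition/openai-whisper, which moves to "3.12". One hypothetical way to confirm no config.yaml was missed (this script is not part of the PR; it assumes PyYAML is installed and is run from the repository root):

import glob
import yaml

# Print the declared Python build version for every uploaded model's config.
for path in sorted(glob.glob("models/model_upload/**/config.yaml", recursive=True)):
  with open(path) as f:
    cfg = yaml.safe_load(f)
  print(path, cfg.get("build_info", {}).get("python_version"))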