Add downloading models from www.modelscope.cn #2830

Merged: 4 commits, Dec 24, 2023
10 changes: 10 additions & 0 deletions README.md
@@ -109,6 +109,16 @@ LLama 2, Vicuna, Alpaca, Baize, ChatGLM, Dolly, Falcon, FastChat-T5, GPT4ALL, Gu

See a complete list of supported models and instructions to add a new model [here](docs/model_support.md).

#### Use Models from ModelScope
You can use models from www.modelscope.cn by setting the environment variable `FASTCHAT_USE_MODELSCOPE`.
```
export FASTCHAT_USE_MODELSCOPE=True
```
Example:
```
FASTCHAT_USE_MODELSCOPE=True python3 -m fastchat.serve.cli --model-path qwen/Qwen-7B-Chat --revision v1.1.9
```

#### Single GPU
The command below requires around 14GB of GPU memory for Vicuna-7B and 28GB of GPU memory for Vicuna-13B.
See the ["Not Enough Memory" section](#not-enough-memory) below if you do not have enough memory.
13 changes: 13 additions & 0 deletions fastchat/model/model_adapter.py
@@ -319,6 +319,19 @@ def load_model(
    if dtype is not None:  # Overwrite dtype if it is provided in the arguments.
        kwargs["torch_dtype"] = dtype

    if os.environ.get("FASTCHAT_USE_MODELSCOPE", "False").lower() == "true":
        # Download the model from the ModelScope hub;
        # lazy import so that modelscope is not required for normal use.
        try:
            from modelscope.hub.snapshot_download import snapshot_download

            model_path = snapshot_download(model_id=model_path, revision=revision)
        except ImportError as e:
            warnings.warn(
                "Using models from www.modelscope.cn requires `pip install modelscope`"
            )
            raise e

    # Load model
    model, tokenizer = adapter.load_model(model_path, kwargs)

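In isolation, the new branch just maps a ModelScope model id to a local snapshot directory before the usual Hugging Face loading path runs. Below is a minimal standalone sketch of the same pattern; the helper name `resolve_model_path` is illustrative and not part of the PR.

```python
import os
from typing import Optional


def resolve_model_path(model_path: str, revision: Optional[str] = None) -> str:
    """Resolve a model id to a local path, optionally via ModelScope.

    If FASTCHAT_USE_MODELSCOPE is set to "true" (case-insensitive), the id is
    downloaded from www.modelscope.cn and the local snapshot directory is
    returned; otherwise the id is passed through unchanged.
    """
    if os.environ.get("FASTCHAT_USE_MODELSCOPE", "False").lower() == "true":
        try:
            # Lazy import: modelscope is only required when the flag is set.
            from modelscope.hub.snapshot_download import snapshot_download
        except ImportError as e:
            raise ImportError(
                "Using models from www.modelscope.cn requires `pip install modelscope`"
            ) from e
        return snapshot_download(model_id=model_path, revision=revision)
    return model_path
```

Everything downstream (`adapter.load_model`, tokenizer loading) then operates on a local path exactly as it does for Hugging Face models.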
3 changes: 3 additions & 0 deletions fastchat/serve/model_worker.py
@@ -49,6 +49,7 @@ def __init__(
        device: str,
        num_gpus: int,
        max_gpu_memory: str,
        revision: str = None,
        dtype: Optional[torch.dtype] = None,
        load_8bit: bool = False,
        cpu_offloading: bool = False,
@@ -76,6 +77,7 @@ def __init__(
        logger.info(f"Loading the model {self.model_names} on worker {worker_id} ...")
        self.model, self.tokenizer = load_model(
            model_path,
            revision=revision,
            device=device,
            num_gpus=num_gpus,
            max_gpu_memory=max_gpu_memory,
@@ -345,6 +347,7 @@ def create_model_worker():
        args.model_path,
        args.model_names,
        args.limit_worker_concurrency,
        revision=args.revision,
        no_register=args.no_register,
        device=args.device,
        num_gpus=args.num_gpus,
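The hunk above forwards `args.revision` into the worker but does not show where the flag is declared. As a rough sketch (argument names and defaults here are assumptions, not taken from this diff), the worker CLI would expose the option roughly like this and thread the parsed value through `ModelWorker.__init__` into `load_model`:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="lmsys/vicuna-7b-v1.5")
parser.add_argument(
    "--revision",
    type=str,
    default="main",
    help="Model revision identifier on the hub (Hugging Face or ModelScope)",
)
args = parser.parse_args()

# With FASTCHAT_USE_MODELSCOPE=True in the environment, load_model() hands this
# revision to modelscope's snapshot_download; otherwise it is used as the
# Hugging Face revision, as before.
```

A worker could then be started much like the CLI example in the README, e.g. `FASTCHAT_USE_MODELSCOPE=True python3 -m fastchat.serve.model_worker --model-path qwen/Qwen-7B-Chat --revision v1.1.9` (assuming the worker accepts the same flags).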