Update model_chatglm.py #2766

Open

wants to merge 6 commits into base: main

Changes from 3 commits
37 changes: 37 additions & 0 deletions fastchat/model/model_chatglm.py
@@ -37,6 +37,35 @@ def process_response(response):
    return response


def apply_stopping_string(reply, stop_strings):
    if isinstance(stop_strings, str):
        stop_strings = [stop_strings]

    stop_found = False

    for string in stop_strings[:4]:
        if isinstance(string, str):
            idx = reply.find(string)
            if idx != -1:
                reply = reply[:idx]
                stop_found = True

    if not stop_found:
        # If something like "\nYo" is generated just before "\nYou:" is completed, trim it
        for string in stop_strings[:4]:
            if isinstance(string, str):
                for j in range(len(string) - 1, 0, -1):
                    if reply[-j:] == string[:j]:
                        reply = reply[:-j]
                        break
                else:
                    continue

                break

    return stop_found, reply


@torch.inference_mode()
def generate_stream_chatglm(
    model,
@@ -53,6 +82,7 @@ def generate_stream_chatglm(
    top_p = float(params.get("top_p", 1.0))
    max_new_tokens = int(params.get("max_new_tokens", 256))
    echo = params.get("echo", True)
    stop = params.get("stop", [])

    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    input_echo_len = len(inputs["input_ids"][0])
@@ -78,6 +108,10 @@
        response = tokenizer.decode(output_ids)
        response = process_response(response)

        stop_found, response = (
            apply_stopping_string(response, stop) if response else (False, response)
        )

        yield {
            "text": response,
            "usage": {
@@ -88,6 +122,9 @@
            "finish_reason": None,
        }

        if stop_found:
            break

    # TODO: ChatGLM stop when it reach max length
    # Only last stream result contains finish_reason, we set finish_reason as stop
    ret = {
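For reference, a minimal sketch of how the added apply_stopping_string helper behaves once this change is applied. The reply text and stop strings below are illustrative examples, not taken from the PR:

# Hypothetical inputs, for illustration only.
reply = "Sure, I can help.\nYo"
stop_found, trimmed = apply_stopping_string(reply, ["\nYou:"])
# No full stop string is present, but the trailing partial match "\nYo"
# is trimmed so it is not streamed to the client:
# stop_found == False, trimmed == "Sure, I can help."

stop_found, trimmed = apply_stopping_string("Answer.\nYou: next question", "\nYou:")
# A single stop string may also be passed as a plain str; the reply is cut
# at the first occurrence and generation stops:
# stop_found == True, trimmed == "Answer."

In generate_stream_chatglm, the stop value is read from params.get("stop", []), so a client can supply either one string or a list (only the first four entries are checked); once stop_found is True, the streaming loop breaks after yielding the trimmed text.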