Create chatllm.py
chatllm.py
ADDED
@@ -0,0 +1,158 @@
import os
from typing import Dict, List, Optional, Tuple, Union

import torch
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from transformers import AutoModel, AutoTokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"

DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE


def torch_gc():
    # Release cached GPU memory after each generation step.
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # transformer.word_embeddings takes 1 slot
    # transformer.final_layernorm and lm_head together take 1 slot
    # transformer.layers takes 28 slots
    # 30 slots in total, distributed across num_gpus cards
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # bugfix: on Linux, torch.embedding can receive weight and input on
    # different devices, raising a RuntimeError.
    # On Windows, model.device is set to transformer.word_embeddings.device;
    # on Linux, model.device is set to lm_head.device.
    # When chat or stream_chat is called, input_ids is placed on model.device,
    # so if transformer.word_embeddings.device differs from model.device a
    # RuntimeError follows. transformer.word_embeddings,
    # transformer.final_layernorm, and lm_head are therefore all pinned to
    # the first card.
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map
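
As a sanity check (illustrative, not part of the commit): with num_gpus=2, per_gpu_layers is 15, so the first card ends up with word_embeddings, final_layernorm, lm_head, and layers 0-12 (15 slots), while the second card takes layers 13-27:

# Hypothetical check of the layout produced for two GPUs.
dm = auto_configure_device_map(2)
assert dm['transformer.word_embeddings'] == 0  # pinned to the first card
assert dm['transformer.layers.12'] == 0        # last transformer layer on GPU 0
assert dm['transformer.layers.13'] == 1        # first transformer layer on GPU 1
assert dm['transformer.layers.27'] == 1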

class ChatLLM(LLM):
    max_token: int = 10000
    temperature: float = 0.1
    top_p = 0.9
    history = []
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "ChatLLM"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:

        # self.model holds the literal string 'Minimax' when the remote
        # Minimax API is used instead of a local ChatGLM model.
        if self.model == 'Minimax':
            import requests

            group_id = os.getenv('group_id')
            api_key = os.getenv('api_key')

            url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }
            request_body = {
                "model": "abab5-chat",
                "tokens_to_generate": 512,
                'messages': []
            }

            # Replay the conversation history as alternating USER/BOT turns.
            for h_input, h_reply in self.history:
                request_body['messages'].append({
                    "sender_type": "USER",
                    "text": h_input
                })
                request_body['messages'].append({"sender_type": "BOT", "text": h_reply})

            request_body['messages'].append({"sender_type": "USER", "text": prompt})
            resp = requests.post(url, headers=headers, json=request_body)
            response = resp.json()['reply']
            # Append this turn's AI reply to messages.
            request_body['messages'].append({"sender_type": "BOT", "text": response})
            self.history.append((prompt, response))

        else:
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                temperature=self.temperature,
            )
            torch_gc()
            if stop is not None:
                response = enforce_stop_tokens(response, stop)
            self.history = self.history + [[None, response]]
        return response

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b-int4",
                   llm_device=DEVICE,
                   device_map: Optional[Dict[str, int]] = None,
                   **kwargs):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
        if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
            # Choose single- or multi-GPU deployment based on the number of
            # visible GPUs.
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and device_map is None:
                self.model = (
                    AutoModel.from_pretrained(
                        model_name_or_path,
                        trust_remote_code=True,
                        **kwargs)
                    .half()
                    .cuda()
                )
            else:
                from accelerate import dispatch_model

                model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, **kwargs).half()
                # A device_map may be passed in to customize how layers are
                # placed on each card.
                if device_map is None:
                    device_map = auto_configure_device_map(num_gpus)

                self.model = dispatch_model(model, device_map=device_map)
        else:
            # No usable CUDA device: fall back to full-precision inference
            # on llm_device.
            self.model = (
                AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True)
                .float()
                .to(llm_device)
            )
        self.model = self.model.eval()
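
A minimal usage sketch, assuming a CUDA-capable machine and the file's default checkpoint (the prompt string is illustrative):

llm = ChatLLM()
llm.load_model(model_name_or_path="THUDM/chatglm-6b-int4")
print(llm("Introduce yourself in one sentence."))  # langchain routes this through _call

To use the Minimax branch instead, set the group_id and api_key environment variables and assign the literal string 'Minimax' to llm.model before invoking it; no local model is loaded in that case.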