Commit 1b61607 (parent: 6db86f7) by RanchiZhao: Update README.md

README.md:
---
license: apache-2.0
language:
- zh
- en
pipeline_tag: text-generation
---
<div align="center">
<img src="https://github.com/OpenBMB/MiniCPM/tree/main/assets/minicpm_logo.png" width="500em" ></img>
</div>
## Introduction

MiniCPM3-4B is the third generation of the MiniCPM series. Its overall performance surpasses Phi-3.5-mini-Instruct and GPT-3.5-Turbo-0125, and it is comparable with many recent 7B-9B models.

Compared to MiniCPM1.0/MiniCPM2.0, MiniCPM3-4B has a more powerful and versatile skill set that enables more general usage. It supports function calling and a code interpreter; please refer to [Advanced Features](https://github.com/OpenBMB/MiniCPM/tree/main?tab=readme-ov-file#%E8%BF%9B%E9%98%B6%E5%8A%9F%E8%83%BD) for usage guidelines.
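As a rough illustration, function calling can be sketched through the generic Hugging Face chat-template tools API. This is a hypothetical sketch, not the official recipe: whether MiniCPM3's chat template accepts a `tools` argument this way is an assumption, `get_weather` is an invented example tool, and the officially supported format is the one documented in the Advanced Features guide. It reuses a `model` and `tokenizer` loaded as in the Transformers example below.

```python
# Hypothetical sketch of function calling via the generic HF chat-template API.
# Whether MiniCPM3's template supports `tools` this way is an assumption; see
# the Advanced Features guide for the officially supported format.
def get_weather(city: str) -> str:
    """Get the current weather for a city.

    Args:
        city: Name of the city.
    """
    return f"Sunny in {city}"  # placeholder implementation

messages = [{"role": "user", "content": "What's the weather in Beijing?"}]
# transformers converts the Python function into a JSON tool schema for the template.
inputs = tokenizer.apply_chat_template(
    messages, tools=[get_weather], add_generation_prompt=True, return_tensors="pt"
).to(model.device)
out = model.generate(inputs, max_new_tokens=256)
# The model is expected to emit a structured tool call; the caller parses it,
# runs get_weather, and appends the result as a "tool" message for a second pass.
print(tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True))
```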
MiniCPM3-4B has a 32k context window. Equipped with LLMxMapReduce, it can in principle handle unlimited context without requiring a huge amount of memory.
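The divide-and-conquer idea behind this kind of long-context processing can be sketched as follows. This is a schematic illustration only, not the actual LLMxMapReduce algorithm; `ask_model` is a hypothetical stand-in for any short-context call into the model (for example, the inference snippets below), and real implementations would chunk by tokens rather than characters.

```python
# Schematic map-reduce over a long document (not the real LLMxMapReduce code).
# ask_model(prompt) is a hypothetical helper that runs one short-context query.
def map_reduce_answer(document: str, question: str, ask_model, chunk_size: int = 24000):
    # Map: split the document into chunks that fit the 32k window and extract
    # a partial answer from each chunk independently.
    chunks = [document[i:i + chunk_size] for i in range(0, len(document), chunk_size)]
    partials = [
        ask_model(f"Context:\n{chunk}\n\nQuestion: {question}\nAnswer using only this context:")
        for chunk in chunks
    ]
    # Reduce: merge the partial answers into one final answer. Only one chunk
    # (plus the short partials) is in memory per call, so the total context
    # length is unbounded in principle.
    merged = "\n\n".join(partials)
    return ask_model(f"Partial answers:\n{merged}\n\nQuestion: {question}\nCombine them into one final answer:")
```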
### Inference with Transformers

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

path = "openbmb/MiniCPM3-4B-GPTQ-Int4"
device = "cuda"

# trust_remote_code is required because MiniCPM3 ships custom modeling code.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True)

messages = [
    {"role": "user", "content": "Recommend 5 attractions in Beijing."},
]
model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(device)

model_outputs = model.generate(
    model_inputs,
    max_new_tokens=1024,
    do_sample=True,  # enable sampling so top_p/temperature take effect
    top_p=0.7,
    temperature=0.7,
    repetition_penalty=1.02
)

# Keep only the newly generated tokens, dropping the echoed prompt.
output_token_ids = [
    model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs))
]
responses = tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
print(responses)
```
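For interactive use, the same setup can stream tokens as they are produced. A minimal sketch using transformers' built-in `TextStreamer`, reusing `model`, `tokenizer`, and `model_inputs` from the snippet above:

```python
from transformers import TextStreamer

# Print tokens to stdout as they are generated, skipping the echoed prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
model.generate(
    model_inputs,
    max_new_tokens=1024,
    do_sample=True,
    top_p=0.7,
    temperature=0.7,
    repetition_penalty=1.02,
    streamer=streamer,
)
```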
### Inference with vLLM
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_name = "openbmb/MiniCPM3-4B-GPTQ-Int4"
prompt = [{"role": "user", "content": "Recommend 5 attractions in Beijing."}]

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)

llm = LLM(
    model=model_name,
    trust_remote_code=True,
    tensor_parallel_size=1,
    quantization='gptq',  # load the GPTQ int4 weights
)
sampling_params = SamplingParams(top_p=0.7, temperature=0.7, max_tokens=1024, repetition_penalty=1.02)

outputs = llm.generate(prompts=input_text, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```
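Because vLLM batches requests efficiently, the same objects can serve many prompts at once. A small sketch, reusing `llm`, `tokenizer`, and `sampling_params` from above (the example questions are illustrative):

```python
questions = ["How long is the Great Wall?", "What is the population of Beijing?"]
prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": q}], tokenize=False, add_generation_prompt=True
    )
    for q in questions
]
# vLLM schedules all prompts in one batch and returns results in order.
for output in llm.generate(prompts=prompts, sampling_params=sampling_params):
    print(output.outputs[0].text)
```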
## Statement

* As a language model, MiniCPM3-4B generates content by learning from a vast amount of text.
* However, it does not possess the ability to comprehend or express personal opinions or value judgments.