import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, TextStreamer
from threading import Thread
import gradio as gr
from peft import PeftModel

model_name_or_path = "sarvamai/OpenHathi-7B-Hi-v0.1-Base"
peft_model_id = "shuvom/OpenHathi-7B-FT-v0.1_SI"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, device_map="auto")

# tokenizer.chat_template = chat_template
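# The tokenizer is loaded from the fine-tuned adapter repo, which is assumed to ship the chat template
# (used by apply_chat_template below) and any tokens added during fine-tuning; the base model's embedding
# matrix is resized to match its vocabulary.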
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# make embedding resizing configurable?
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

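# Attach the fine-tuned PEFT adapter (typically LoRA) on top of the quantized base model.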
model = PeftModel.from_pretrained(model, peft_model_id)

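# Thin wrapper around a model/tokenizer pair. Two streamers are kept:
#   - TextIteratorStreamer: yields tokens back to Python code (used for the Gradio chat below),
#   - TextStreamer: prints tokens straight to stdout (used by get_completion for quick checks).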
class ChatCompletion:
  def __init__(self, model, tokenizer, system_prompt=None):
    self.model = model
    self.tokenizer = tokenizer
    self.streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True)
    self.print_streamer = TextStreamer(self.tokenizer, skip_prompt=True)
    # set the model in inference mode
    self.model.eval()
    self.system_prompt = system_prompt

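  # Single-turn completion that streams tokens to stdout while generating, then returns the decoded completion.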
  def get_completion(self, prompt, system_prompt=None, message_history=None, max_new_tokens=512, temperature=0.0):
    if temperature < 1e-2:
      temperature = 1e-2
    messages = []
    if message_history is not None:
      messages.extend(message_history)
    elif system_prompt or self.system_prompt:
      system_prompt = system_prompt or self.system_prompt
      messages.append({"role": "system", "content":system_prompt})
    messages.append({"role": "user", "content": prompt})
    chat_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = self.tokenizer(chat_prompt, return_tensors="pt", add_special_tokens=False).to(self.model.device)
    # Stream tokens to stdout via TextStreamer as they are generated; no background thread is needed here.
    generation_kwargs = dict(max_new_tokens=max_new_tokens,
                             temperature=temperature,
                             top_p=0.95,
                             do_sample=True,
                             eos_token_id=self.tokenizer.eos_token_id,
                             repetition_penalty=1.2
                             )
    output_ids = self.model.generate(**inputs, streamer=self.print_streamer, **generation_kwargs)
    # Decode only the newly generated tokens, skipping the prompt portion of the output.
    generated_text = self.tokenizer.decode(output_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return generated_text

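  # Chat handler with the (message, history) signature expected by gr.ChatInterface: generation runs in a
  # background thread and partial text is yielded from the TextIteratorStreamer so the UI can stream tokens.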
  def get_chat_completion(self, message, history):
    messages = []
    if self.system_prompt:
      messages.append({"role": "system", "content":self.system_prompt})
    for user_message, assistant_message in history:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "system", "content": assistant_message})
    messages.append({"role": "user", "content": message})
    chat_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = self.tokenizer(chat_prompt, return_tensors="pt", add_special_tokens=False).to(self.model.device)
    # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
    generation_kwargs = dict(inputs,
                             streamer=self.streamer,
                             max_new_tokens=2048,
                             temperature=0.2,
                             top_p=0.95,
                             eos_token_id=self.tokenizer.eos_token_id,
                             do_sample=True,
                             repetition_penalty=1.2,
                             )
    thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
    thread.start()
    generated_text = ""
    for new_text in self.streamer:
        generated_text += new_text.replace(self.tokenizer.eos_token, "")
        yield generated_text
    thread.join()
    return generated_text

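  # Non-streaming variant: blocks until generation finishes and returns a single decoded string.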
  def get_completion_without_streaming(self, prompt, system_prompt=None, message_history=None, max_new_tokens=512, temperature=0.0):
    if temperature < 1e-2:
      temperature = 1e-2
    messages = []
    if message_history is not None:
      messages.extend(message_history)
    elif system_prompt or self.system_prompt:
      system_prompt = system_prompt or self.system_prompt
      messages.append({"role": "system", "content":system_prompt})
    messages.append({"role": "user", "content": prompt})
    chat_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = self.tokenizer(chat_prompt, return_tensors="pt", add_special_tokens=False).to(self.model.device)
    # Plain blocking generation: no streamer; the output is decoded once generation finishes.
    generation_kwargs = dict(max_new_tokens=max_new_tokens,
                             temperature=temperature,
                             top_p=0.95,
                             do_sample=True,
                             repetition_penalty=1.1)
    outputs = self.model.generate(**inputs, **generation_kwargs)
    # Return only the newly generated part of the sequence, without the prompt.
    generated_text = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return generated_text

text_generator = ChatCompletion(model, tokenizer, system_prompt="You are a native Hindi speaker who can converse at expert level in both Hindi and colloquial Hinglish.")
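# Optional sanity checks outside the UI (the prompts below are illustrative only):
# text_generator.get_completion("भारत की राजधानी क्या है?")                        # "What is the capital of India?" — streams to stdout
# print(text_generator.get_completion_without_streaming("भारत की राजधानी क्या है?"))  # returns a plain string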

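# ChatInterface calls get_chat_completion(message, history) and streams whatever the generator yields;
# queue() enables the request queue that streaming responses rely on, and debug=True keeps launch() blocking
# so errors and logs stay visible in the console.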
gr.ChatInterface(text_generator.get_chat_completion).queue().launch(debug=True)