Christoph Holthaus committed on
Commit a2232d8
1 Parent(s): 493f720
Files changed (1)
  1. app.py +9 -3
app.py CHANGED
@@ -5,10 +5,13 @@ from time import time
 import gradio as gr
 import psutil
 
+# load like this - use the variable everywhere
+model_path = os.getenv("MODEL_PATH")
+# show a warning when it is empty, plus a brief description of how to set it
+
 # Initing things
-print("debug: init model")
+print(f"debug: init model: {model_path}")
 #llm = Llama(model_path="./model.bin") # LLaMa model
-llama_model_name = "TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF"
 print("! INITING DONE !")
 
 # Preparing things to work
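Note: the warning mentioned in the new comment is still a TODO. A minimal sketch of what that check could look like (not part of this commit; the message text is only an example, the "./model.bin" path is taken from the commented-out Llama line):

import os

model_path = os.getenv("MODEL_PATH")
if not model_path:
    # example wording only - point the user at the environment variable / Space secret to set
    print("WARNING: MODEL_PATH is not set. Set it to the model file to load, e.g. MODEL_PATH=./model.bin")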
@@ -45,13 +48,14 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 # download model here
 # check local storage; if not there, load, else use existing.
+# check gradio - how does it download? is there a function we can use?
 
 if torch.cuda.is_available():
     model_id = "mistralai/Mistral-7B-Instruct-v0.1"
     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-
+# we need to make sure we only run one thread, or we will probably run out of RAM
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
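Note: the "only run one thread" concern is also left as a comment. One possible sketch (not part of this commit, and assuming generate is a streaming generator as in the standard Gradio chat template) is to guard generation with a lock so requests are served one at a time and memory use stays bounded:

import threading

generate_lock = threading.Lock()

def generate_one_at_a_time(*args, **kwargs):
    # hold the lock for the whole streamed response so only one generation runs at once
    with generate_lock:
        yield from generate(*args, **kwargs)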
@@ -133,6 +137,7 @@ chat_interface = gr.ChatInterface(
         ),
     ],
     stop_btn=None,
+    # add more eval examples, like a long list taken from teknium and others, maybe grouped by type
     examples=[
         ["Hello there! How are you doing?"],
         ["Can you explain briefly to me what is the Python programming language?"],
@@ -149,6 +154,7 @@ with gr.Blocks(css="style.css") as demo:
         value="Duplicate Space for private use",
         elem_id="duplicate-button",
         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+        # add
     )
     chat_interface.render()
 
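Note: for the open question about downloading the model ("check local storage; if not there, load, else use existing"), one option, assuming the model ships as a single GGUF file on the Hub, is huggingface_hub's cached download. The repo id below is the one removed in this commit; the filename is only an illustrative guess:

from huggingface_hub import hf_hub_download

def ensure_model(repo_id: str, filename: str) -> str:
    # downloads on the first call and returns the already-cached file on later calls
    return hf_hub_download(repo_id=repo_id, filename=filename)

# model_file = ensure_model("TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF",
#                           "dolphin-2.2.1-ashhlimarp-mistral-7b.Q4_K_M.gguf")  # filename is a guess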