Christoph Holthaus committed on
Commit 576474f
1 Parent(s): def624a
Files changed (1)
  1. app.py +30 -19
app.py CHANGED
@@ -1,20 +1,39 @@
- #!/usr/bin/env python#
+ #!/usr/bin/env python
+
+ import os
+ import requests
+ from threading import Thread
+ from typing import Iterator

- from llama_cpp import Llama
- from time import time
  import gradio as gr
  import psutil
- import os
+ import spaces
+ import torch
+ from time import time
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from llama_cpp import Llama

  # load like this - use tne variable everywhere
- model_hf_path=os.getenv("MODEL_HF_PATH")
+ model_uri_hf=os.getenv("MODEL_URI_HF")
+ # DEBUG!
+ model_uri_hf="https://huggingface.co/TheBloke/neural-chat-7B-v3-2-GGUF/blob/main/neural-chat-7b-v3-2.Q2_K.gguf"
  # show warning, when empty and briefs description of how to set it
  # also add link to "how to search" with link to bloke by default + example search link + example full value (mistral base?)
  # info about ram requirements

  # Initing things
- print(f"debug: init model: {model_hf_path}")
- #llm = Llama(model_path="./model.bin") # LLaMa model
+ print(f"debug: init model: {model_uri_hf}")
+
+ # Check if the model file already exists
+ if not os.path.isfile('model.bin'):
+     # Download the model
+     response = requests.get(model_uri_hf)
+
+     # Save the model to a local file
+     with open('model.bin', 'wb') as file:
+         file.write(response.content)
+
+ llm = Llama(model_path="./model.bin") # LLaMa model
  print("! INITING DONE !")

  # Preparing things to work
@@ -31,27 +50,18 @@ print(f"DEBUG: Memory free: {psutil.virtual_memory().free / (1024.0 ** 3)} GiB")
  print(f"DEBUG: Memory available: {psutil.virtual_memory().available / (1024.0 ** 3)} GiB")
  print(f"DEBUG: Memory: {psutil.virtual_memory().total / (1024.0 ** 3)} GiB")

-
- from threading import Thread
- from typing import Iterator
-
- import spaces
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
- DESCRIPTION = f"# Test model: {model_hf_path}"
+ DESCRIPTION = f"# Test model: {model_uri_hf}"

  if torch.cuda.is_available():
      DESCRIPTION += "\n<p>This space is using CPU only. Use a different one if you want to go fast and use GPU. </p>"

+ #todo - probably lower. like 200 in and maybe 500 out? Should be ok for quick test
  MAX_MAX_NEW_TOKENS = 2048
  DEFAULT_MAX_NEW_TOKENS = 1024
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))


- #download model here
- # check localstorage, if no there, load, else use existing.
- # check gradio - how does it dl? is there a function we can use?
+

  if torch.cuda.is_available():
      model_id = "mistralai/Mistral-7B-Instruct-v0.1"
@@ -79,6 +89,7 @@ def generate(
          gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
      input_ids = input_ids.to(model.device)

+     streamer= Llama()
      streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
      generate_kwargs = dict(
          {"input_ids": input_ids},