muhtasham committed on
Commit
9339181
1 Parent(s): 92a09c6

chore: Update Dockerfile to improve build process and upgrade pip

Browse files
Files changed (3) hide show
  1. Dockerfile +0 -3
  2. README.md +2 -1
  3. main.py +18 -15
Dockerfile CHANGED
@@ -1,9 +1,6 @@
1
  # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
  # you will also find guides on how best to write your Dockerfile
3
 
4
- # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
5
- # you will also find guides on how best to write your Dockerfile
6
-
7
  # Use the official Python 3.10 slim image as the base image
8
  FROM tiangolo/uvicorn-gunicorn:python3.10-slim
9
 
 
1
  # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
  # you will also find guides on how best to write your Dockerfile
3
 
 
 
 
4
  # Use the official Python 3.10 slim image as the base image
5
  FROM tiangolo/uvicorn-gunicorn:python3.10-slim
6
 
README.md CHANGED
@@ -9,4 +9,5 @@ pinned: false
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
11
 
12
- The SOTA open VLM is [InternVL-1.5](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard), which is 22B. For practical deployment I chose moondream, a model that can answer real-world questions about images (378x378). It's tiny by today's standards, with only 1.6B parameters. That enables it to run on a variety of devices, including mobile phones and edge devices.
 
 
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
11
 
12
+ The SOTA open VLM is [InternVL-1.5](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard), which is *22B*. For practical deployment I chose moondream, a model that can answer real-world questions about images (378x378). It's tiny by today's standards, with only *1.6B* parameters. That enables it to run on a variety of devices, including mobile phones and edge devices.
13
+
main.py CHANGED
@@ -4,15 +4,13 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
4
  from PIL import Image
5
  from openai import AsyncOpenAI
6
  from pydantic import BaseModel
7
- from fastapi.logger import logger
8
 
9
  import io
10
  import os
11
  import multion
12
  import torch
13
  import instructor
14
- import openai
15
-
16
  from multion.client import MultiOn
17
  from dotenv import load_dotenv
18
 
@@ -20,26 +18,27 @@ from dotenv import load_dotenv
20
  load_dotenv()
21
 
22
  multion = MultiOn(api_key=os.environ.get("MULTION_API_KEY"))
23
- logger.info("MultiOn API key loaded")
24
 
25
  app = FastAPI()
26
 
27
  device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
28
- logger.info(f"Device: {device}")
29
 
30
  model_id = "vikhyatk/moondream2"
31
  revision = "2024-05-20"
32
  model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision).to(device)
33
- logger.info(f"Model loaded: {model_id} to {device}")
34
  model = torch.compile(model)
35
- logger.info(f"Model compiled: {model_id} to {device}")
36
  tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
37
- logger.info(f"Tokenizer loaded: {model_id}")
38
 
39
  client = instructor.from_openai(AsyncOpenAI(
40
  # This is the default and can be omitted
41
  api_key=os.environ.get("OPENAI_API_KEY"),
42
  ))
 
43
 
44
  class MultiOnInputBrowse(BaseModel):
45
  """
@@ -80,13 +79,17 @@ async def process_image_file(file: UploadFile) -> str:
80
  except Exception as e:
81
  raise HTTPException(status_code=500, detail=str(e))
82
 
 
 
 
 
83
  @app.post("/process-input/")
84
  async def process_input(text: str = Form(...), file: UploadFile = File(None)):
85
  if file is not None:
86
  try:
87
- logger.info("Processing image file")
88
  image_description = await process_image_file(file)
89
- logger.info(f"Image description: {image_description}")
90
  except HTTPException as e:
91
  raise e
92
  else:
@@ -99,18 +102,18 @@ async def process_input(text: str = Form(...), file: UploadFile = File(None)):
99
  else:
100
  processed_text = text
101
 
102
- logger.info(f"Processed text: {processed_text}")
103
  command = await generate_command(processed_text)
104
- logger.info(f"Command generated: {command.message}")
105
 
106
  try:
107
- logger.info("Calling MultiOn API")
108
  response = multion.browse(
109
  cmd=command.cmd,
110
  url=command.url,
111
  local=command.local
112
  )
113
- logger.info(f"Response received: {response.message}")
114
  return JSONResponse(content={"response": response.message, "command": command.model_dump()})
115
 
116
  except Exception as e:
@@ -119,7 +122,7 @@ async def process_input(text: str = Form(...), file: UploadFile = File(None)):
119
 
120
  async def generate_command(content: str) -> MultiOnInputBrowse:
121
  try:
122
- response = await openai.ChatCompletion.create(
123
  model="gpt-4o",
124
  messages=[
125
  {
 
4
  from PIL import Image
5
  from openai import AsyncOpenAI
6
  from pydantic import BaseModel
7
+ from rich import print
8
 
9
  import io
10
  import os
11
  import multion
12
  import torch
13
  import instructor
 
 
14
  from multion.client import MultiOn
15
  from dotenv import load_dotenv
16
 
 
18
  load_dotenv()
19
 
20
  multion = MultiOn(api_key=os.environ.get("MULTION_API_KEY"))
21
+ print("MultiOn API key loaded")
22
 
23
  app = FastAPI()
24
 
25
  device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
26
+ print(f"Device: {device}")
27
 
28
  model_id = "vikhyatk/moondream2"
29
  revision = "2024-05-20"
30
  model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision).to(device)
31
+ print(f"Model loaded: {model_id} to {device}")
32
  model = torch.compile(model)
33
+ print(f"Model compiled: {model_id} to {device}")
34
  tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
35
+ print(f"Tokenizer loaded: {model_id}")
36
 
37
  client = instructor.from_openai(AsyncOpenAI(
38
  # This is the default and can be omitted
39
  api_key=os.environ.get("OPENAI_API_KEY"),
40
  ))
41
+ print("OpenAI API key loaded and client initialized")
42
 
43
  class MultiOnInputBrowse(BaseModel):
44
  """
 
79
  except Exception as e:
80
  raise HTTPException(status_code=500, detail=str(e))
81
 
82
+ @app.get("/")
83
+ def read_root():
84
+ return {"Hello": "World"}
85
+
86
  @app.post("/process-input/")
87
  async def process_input(text: str = Form(...), file: UploadFile = File(None)):
88
  if file is not None:
89
  try:
90
+ print("Processing image file")
91
  image_description = await process_image_file(file)
92
+ print(f"Image description: {image_description}")
93
  except HTTPException as e:
94
  raise e
95
  else:
 
102
  else:
103
  processed_text = text
104
 
105
+ print(f"Processed text: {processed_text}")
106
  command = await generate_command(processed_text)
107
+ print(f"Command generated: {command.message}")
108
 
109
  try:
110
+ print("Calling MultiOn API")
111
  response = multion.browse(
112
  cmd=command.cmd,
113
  url=command.url,
114
  local=command.local
115
  )
116
+ print(f"Response received: {response.message}")
117
  return JSONResponse(content={"response": response.message, "command": command.model_dump()})
118
 
119
  except Exception as e:
 
122
 
123
  async def generate_command(content: str) -> MultiOnInputBrowse:
124
  try:
125
+ response = await client.chat.completions.create(
126
  model="gpt-4o",
127
  messages=[
128
  {