Spaces:

muhtasham
/

agent

Sleeping

App Files Files Community

muhtasham committed on Jun 4

Commit

9339181

•

1 Parent(s): 92a09c6

chore: Update Dockerfile to improve build process and upgrade pip

Browse files

Files changed (3) hide show

Dockerfile +0 -3
README.md +2 -1
main.py +18 -15

Dockerfile CHANGED Viewed

@@ -1,9 +1,6 @@
 # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
 # you will also find guides on how best to write your Dockerfile
-# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
-# you will also find guides on how best to write your Dockerfile
 # Use the tiangolo/uvicorn-gunicorn Python 3.10 slim image as the base image
 FROM tiangolo/uvicorn-gunicorn:python3.10-slim

 # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
 # you will also find guides on how best to write your Dockerfile
 # Use the tiangolo/uvicorn-gunicorn Python 3.10 slim image as the base image
 FROM tiangolo/uvicorn-gunicorn:python3.10-slim

README.md CHANGED Viewed

@@ -9,4 +9,5 @@ pinned: false
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-The SOTA open VLM is [InternVL-1.5](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard), which is 22B. For practical deployment I chose moondream, a model that can answer real-world questions about images (378x378). It's tiny by today's standards, with only 1.6B parameters. That enables it to run on a variety of devices, including mobile phones and edge devices.


9
10	Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
11
12	+ The SOTA open VLM is [InternVL-1.5](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard), which is 22B. For practical deployment I chose moondream, a model that can answer real-world questions about images (378x378). It's tiny by today's standards, with only 1.6B parameters. That enables it to run on a variety of devices, including mobile phones and edge devices.
13	+

main.py CHANGED Viewed

@@ -4,15 +4,13 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from PIL import Image
 from openai import AsyncOpenAI
 from pydantic import BaseModel
-from fastapi.logger import logger
 import io
 import os
 import multion
 import torch
 import instructor
-import openai
 from multion.client import MultiOn
 from dotenv import load_dotenv
@@ -20,26 +18,27 @@ from dotenv import load_dotenv
 load_dotenv()
 multion = MultiOn(api_key=os.environ.get("MULTION_API_KEY"))
-logger.info("MultiOn API key loaded")
 app = FastAPI()
 device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
-logger.info(f"Device: {device}")
 model_id = "vikhyatk/moondream2"
 revision = "2024-05-20"
 model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision).to(device)
-logger.info(f"Model loaded: {model_id} to {device}")
 model = torch.compile(model)
-logger.info(f"Model compiled: {model_id} to {device}")
 tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
-logger.info(f"Tokenizer loaded: {model_id}")
 client = instructor.from_openai(AsyncOpenAI(
     # This is the default and can be omitted
     api_key=os.environ.get("OPENAI_API_KEY"),
 ))
 class MultiOnInputBrowse(BaseModel):
     """
@@ -80,13 +79,17 @@ async def process_image_file(file: UploadFile) -> str:
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.post("/process-input/")
 async def process_input(text: str = Form(...), file: UploadFile = File(None)):
     if file is not None:
         try:
-            logger.info("Processing image file")
             image_description = await process_image_file(file)
-            logger.info(f"Image description: {image_description}")
         except HTTPException as e:
             raise e
     else:
@@ -99,18 +102,18 @@ async def process_input(text: str = Form(...), file: UploadFile = File(None)):
     else:
         processed_text = text
-    logger.info(f"Processed text: {processed_text}")
     command = await generate_command(processed_text)
-    logger.info(f"Command generated: {command.message}")
     try:
-        logger.info("Calling MultiOn API")
         response = multion.browse(
             cmd=command.cmd,
             url=command.url,
             local=command.local
         )
-        logger.info(f"Response received: {response.message}")
         return JSONResponse(content={"response": response.message, "command": command.model_dump()})
     except Exception as e:
@@ -119,7 +122,7 @@ async def process_input(text: str = Form(...), file: UploadFile = File(None)):
 async def generate_command(content: str) -> MultiOnInputBrowse:
     try:
-        response = await openai.ChatCompletion.create(
             model="gpt-4o",
             messages=[
                 {

 from PIL import Image
 from openai import AsyncOpenAI
 from pydantic import BaseModel
+from rich import print
 import io
 import os
 import multion
 import torch
 import instructor
 from multion.client import MultiOn
 from dotenv import load_dotenv
 load_dotenv()
 multion = MultiOn(api_key=os.environ.get("MULTION_API_KEY"))
+print("MultiOn API key loaded")
 app = FastAPI()
 device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
+print(f"Device: {device}")
 model_id = "vikhyatk/moondream2"
 revision = "2024-05-20"
 model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision).to(device)
+print(f"Model loaded: {model_id} to {device}")
 model = torch.compile(model)
+print(f"Model compiled: {model_id} to {device}")
 tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+print(f"Tokenizer loaded: {model_id}")
 client = instructor.from_openai(AsyncOpenAI(
     # This is the default and can be omitted
     api_key=os.environ.get("OPENAI_API_KEY"),
 ))
+print("OpenAI API key loaded and client initialized")
 class MultiOnInputBrowse(BaseModel):
     """
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+@app.get("/")
+def read_root():
+    return {"Hello": "World"}
 @app.post("/process-input/")
 async def process_input(text: str = Form(...), file: UploadFile = File(None)):
     if file is not None:
         try:
+            print("Processing image file")
             image_description = await process_image_file(file)
+            print(f"Image description: {image_description}")
         except HTTPException as e:
             raise e
     else:
     else:
         processed_text = text
+    print(f"Processed text: {processed_text}")
     command = await generate_command(processed_text)
+    print(f"Command generated: {command.message}")
     try:
+        print("Calling MultiOn API")
         response = multion.browse(
             cmd=command.cmd,
             url=command.url,
             local=command.local
         )
+        print(f"Response received: {response.message}")
         return JSONResponse(content={"response": response.message, "command": command.model_dump()})
     except Exception as e:
 async def generate_command(content: str) -> MultiOnInputBrowse:
     try:
+        response = await client.chat.completions.create(
             model="gpt-4o",
             messages=[
                 {