Phi-3-vision-128k

Running on Zero

App Files Community

MaziyarPanahi committed on May 22

Commit

49611ce

•

1 Parent(s): 7d910e3

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -38

app.py CHANGED Viewed

@@ -11,11 +11,15 @@ from transformers import TextIteratorStreamer
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">microsoft/Phi-3-vision-128k-instruct</h1>
 </div>
 """
 user_prompt = '<|user|>\n'
 assistant_prompt = '<|assistant|>\n'
 prompt_suffix = "<|end|>\n"
@@ -37,7 +41,8 @@ model.to("cuda:0")
 @spaces.GPU
 def bot_streaming(message, history):
-    print(message)
     if message["files"]:
         # message["files"][-1] is a Dict or just a string
         if type(message["files"][-1]) == dict:
@@ -53,54 +58,48 @@ def bot_streaming(message, history):
     try:
         if image is None:
             # Handle the case where image is None
-            gr.Error("You need to upload an image for Phi-3-vision to work.")
     except NameError:
         # Handle the case where 'image' is not defined at all
-        gr.Error("You need to upload an image for Phi-3-vision to work.")
-    # prompt = f"{message['text']}<|image_1|>\nCan you convert the table to markdown format?{prompt_suffix}{assistant_prompt}"
-    chat = [
-        {"role": "user", "content": f"<|image_1|>\n{message['text']}"},
-    ]
-    prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-    # need to remove last <|endoftext|> if it is there, which is used for training, not inference. For training, make sure to add <|endoftext|> in the end.
-    if prompt.endswith("<|endoftext|>"):
-        prompt = prompt.rstrip("<|endoftext|>")
-    print(f">>> Prompt\n{prompt})")
     image = Image.open(image)
-    inputs = processor(prompt, [image], return_tensors='pt').to("cuda:0")
-    streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": False, "skip_prompt": True})
-    generation_kwargs = dict(
-        inputs,
-        streamer=streamer,
-        max_new_tokens=1024,
-        do_sample=False,
-        temperature=0.0,
-        eos_token_id=processor.tokenizer.eos_token_id
-    )
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
-    time.sleep(0.5)
     for new_text in streamer:
-        # if "<|endoftext|>" in new_text:
-            # break
         buffer += new_text
-        generated_text_without_prompt = buffer
-        # print(generated_text_without_prompt)
-        time.sleep(0.06)
-        # print(f"new_text: {generated_text_without_prompt}")
-        yield generated_text_without_prompt
-chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1, height=550)
 chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...",
                                   show_label=False)
 with gr.Blocks(fill_height=True, ) as demo:

 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+# thanks to https://huggingface.co/ysharma
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+   <img src="https://cdn-thumbnails.huggingface.co/social-thumbnails/models/microsoft/Phi-3-vision-128k-instruct.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55;  ">
+   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Microsoft's Phi3-Vision-128k-Context</h1>
+   <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Phi-3-Vision is a 4.2B parameter multimodal model that brings together language and vision capabilities.</p>
 </div>
 """
 user_prompt = '<|user|>\n'
 assistant_prompt = '<|assistant|>\n'
 prompt_suffix = "<|end|>\n"
 @spaces.GPU
 def bot_streaming(message, history):
+    print(f'message is - {message}')
+    print(f'history is - {history}')
     if message["files"]:
         # message["files"][-1] is a Dict or just a string
         if type(message["files"][-1]) == dict:
     try:
         if image is None:
             # Handle the case where image is None
+            raise gr.Error("You need to upload an image for Phi3-Vision to work. Close the error and try again with an Image.")
     except NameError:
         # Handle the case where 'image' is not defined at all
+        raise gr.Error("You need to upload an image for Phi3-Vision to work. Close the error and try again with an Image.")
+    conversation = []
+    flag=False
+    for user, assistant in history:
+        if assistant is None:
+            #pass
+            flag=True
+            conversation.extend([{"role": "user", "content":""}])
+            continue
+        if flag==True:
+            conversation[0]['content'] = f"<|image_1|>\n{user}"
+            conversation.extend([{"role": "assistant", "content": assistant}])
+            flag=False
+            continue
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    if len(history) == 0:
+        conversation.append({"role": "user", "content": f"<|image_1|>\n{message['text']}"})
+    else:
+        conversation.append({"role": "user", "content": message['text']})
+    print(f"prompt is -\n{conversation}")
+    prompt = processor.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
     image = Image.open(image)
+    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
+    streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": True, "skip_prompt": True, 'clean_up_tokenization_spaces':False,})
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, do_sample=False, temperature=0.0, eos_token_id=processor.tokenizer.eos_token_id,)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
+        yield buffer
+chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1)
 chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...",
                                   show_label=False)
 with gr.Blocks(fill_height=True, ) as demo: