Spaces:

Awiny
/

Image2Paragraph

Runtime error

App Files Files Community

Awiny committed on Apr 15, 2023

Commit

44a0c32

•

1 Parent(s): 40adb4f

full cpu support

Browse files

Files changed (5) hide show

app.py +36 -13
models/__pycache__/controlnet_model.cpython-38.pyc +0 -0
models/__pycache__/image_text_transformation.cpython-38.pyc +0 -0
models/controlnet_model.py +11 -4
models/image_text_transformation.py +1 -1

app.py CHANGED Viewed

@@ -20,7 +20,9 @@ parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cpu'
 args = parser.parse_args()
-device = "cuda" if torch.cuda.is_available() else "cpu"
 if device == "cuda":
     args.image_caption_device = "cuda"
     args.dense_caption_device = "cuda"
@@ -45,24 +47,43 @@ def add_logo():
 def process_image(image_src, options, processor):
     processor.args.semantic_segment = "Semantic Segment" in options
-    gen_text = processor.image_to_text(image_src)
-    gen_image = processor.text_to_image(gen_text)
-    gen_image_str = pil_image_to_base64(gen_image)
     # Combine the outputs into a single HTML output
     custom_output = f'''
-    <h2>Image->Text->Image:</h2>
     <div style="display: flex; flex-wrap: wrap;">
         <div style="flex: 1;">
-            <h3>Image2Text</h3>
-            <p>{gen_text}</p>
         </div>
         <div style="flex: 1;">
-            <h3>Text2Image</h3>
-            <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
         </div>
     </div>
     '''
     return custom_output
 processor = ImageTextTransformation(args)
@@ -70,6 +91,7 @@ processor = ImageTextTransformation(args)
 # Create Gradio input and output components
 image_input = gr.inputs.Image(type='filepath', label="Input Image")
 semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
 logo_base64 = add_logo()
 # Create the title with the logo
@@ -81,15 +103,16 @@ interface = gr.Interface(
     inputs=[image_input,
             gr.CheckboxGroup(
             label="Options",
-            choices=["Semantic Segment"],
             ),
             ],
     outputs=gr.outputs.HTML(),
     title=title_with_logo,
     description="""
     This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
-    \n Semantic segment is very slow in cpu(~8m), best use on gpu or run local.
-    \n Notice the text2image model is controlnet, which used canny edge as reference.
     """
 )

 args = parser.parse_args()
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "cpu"
 if device == "cuda":
     args.image_caption_device = "cuda"
     args.dense_caption_device = "cuda"
 def process_image(image_src, options, processor):
     processor.args.semantic_segment = "Semantic Segment" in options
+    image_generation_status = "Image Generation" in options
+    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)
+    if image_generation_status:
+        gen_image = processor.text_to_image(gen_text)
+        gen_image_str = pil_image_to_base64(gen_image)
     # Combine the outputs into a single HTML output
     custom_output = f'''
+    <h2>Image->Text:</h2>
     <div style="display: flex; flex-wrap: wrap;">
         <div style="flex: 1;">
+            <h3>Image Caption</h3>
+            <p>{image_caption}</p>
         </div>
         <div style="flex: 1;">
+            <h3>Dense Caption</h3>
+            <p>{dense_caption}</p>
+        </div>
+        <div style="flex: 1;">
+            <h3>Region Semantic</h3>
+            <p>{region_semantic}</p>
+        </div>
+        <div style="flex: 1;">
+            <h3>GPT4 Reasoning:</h3>
+            <p>{gen_text}</p>
         </div>
     </div>
     '''
+    if image_generation_status:
+        custom_output += f'''
+        <h2>Text->Image:</h2>
+        <div style="display: flex; flex-wrap: wrap;">
+            <div style="flex: 1;">
+                <h3>Generated Image</h3>
+                <img src="data:image/jpeg;base64,{gen_image_str}" width="400" style="vertical-align: middle;">
+            </div>
+        </div>
+        '''
     return custom_output
 processor = ImageTextTransformation(args)
 # Create Gradio input and output components
 image_input = gr.inputs.Image(type='filepath', label="Input Image")
 semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
+image_generation_checkbox = gr.inputs.Checkbox(label="Image Generation", default=False)
 logo_base64 = add_logo()
 # Create the title with the logo
     inputs=[image_input,
             gr.CheckboxGroup(
             label="Options",
+            choices=["Semantic Segment", "Image Generation"],
             ),
             ],
     outputs=gr.outputs.HTML(),
     title=title_with_logo,
     description="""
     This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
+    \n Since GPU is expensive, we use CPU for demo. Run code local with gpu or google colab we provided for fast speed.
+    \n Semantic segment is very slow in cpu(~8m).
+    \n Ttext2image model is controlnet is also very slow in cpu(~2m), which used canny edge as reference.
     """
 )

models/__pycache__/controlnet_model.cpython-38.pyc CHANGED Viewed

Binary files a/models/__pycache__/controlnet_model.cpython-38.pyc and b/models/__pycache__/controlnet_model.cpython-38.pyc differ

models/__pycache__/image_text_transformation.cpython-38.pyc CHANGED Viewed

Binary files a/models/__pycache__/image_text_transformation.cpython-38.pyc and b/models/__pycache__/image_text_transformation.cpython-38.pyc differ

models/controlnet_model.py CHANGED Viewed

@@ -15,21 +15,28 @@ class TextToImage:
         self.model = self.initialize_model()
     def initialize_model(self):
         controlnet = ControlNetModel.from_pretrained(
             "fusing/stable-diffusion-v1-5-controlnet-canny",
-            torch_dtype=torch.float16,
-        )
         pipeline = StableDiffusionControlNetPipeline.from_pretrained(
             "runwayml/stable-diffusion-v1-5",
             controlnet=controlnet,
             safety_checker=None,
-            torch_dtype=torch.float16,
         )
         pipeline.scheduler = UniPCMultistepScheduler.from_config(
             pipeline.scheduler.config
         )
-        pipeline.enable_model_cpu_offload()
         pipeline.to(self.device)
         return pipeline
     @staticmethod

         self.model = self.initialize_model()
     def initialize_model(self):
+        if self.device == 'cpu':
+            self.data_type = torch.float32
+        else:
+            self.data_type = torch.float16
         controlnet = ControlNetModel.from_pretrained(
             "fusing/stable-diffusion-v1-5-controlnet-canny",
+            torch_dtype=self.data_type,
+            map_location=self.device,  # Add this line
+        ).to(self.device)
         pipeline = StableDiffusionControlNetPipeline.from_pretrained(
             "runwayml/stable-diffusion-v1-5",
             controlnet=controlnet,
             safety_checker=None,
+            torch_dtype=self.data_type,
+            map_location=self.device,  # Add this line
         )
         pipeline.scheduler = UniPCMultistepScheduler.from_config(
             pipeline.scheduler.config
         )
         pipeline.to(self.device)
+        if self.device != 'cpu':
+            pipeline.enable_model_cpu_offload()
         return pipeline
     @staticmethod

models/image_text_transformation.py CHANGED Viewed

@@ -55,7 +55,7 @@ class ImageTextTransformation:
         else:
             region_semantic = " "
         generated_text = self.gpt_model.paragraph_summary_with_gpt(image_caption, dense_caption, region_semantic, width, height)
-        return generated_text
     def text_to_image(self, text):
         generated_image = self.controlnet_model.text_to_image(text, self.ref_image)

         else:
             region_semantic = " "
         generated_text = self.gpt_model.paragraph_summary_with_gpt(image_caption, dense_caption, region_semantic, width, height)
+        return image_caption, dense_caption, region_semantic, generated_text
     def text_to_image(self, text):
         generated_image = self.controlnet_model.text_to_image(text, self.ref_image)