Awiny committed on
Commit 44a0c32 • 1 Parent(s): 40adb4f

full cpu support

app.py CHANGED
@@ -20,7 +20,9 @@ parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cpu'
 
 args = parser.parse_args()
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "cpu"
+
 if device == "cuda":
     args.image_caption_device = "cuda"
     args.dense_caption_device = "cuda"
@@ -45,24 +47,43 @@ def add_logo():
 
 def process_image(image_src, options, processor):
     processor.args.semantic_segment = "Semantic Segment" in options
-    gen_text = processor.image_to_text(image_src)
-    gen_image = processor.text_to_image(gen_text)
-    gen_image_str = pil_image_to_base64(gen_image)
+    image_generation_status = "Image Generation" in options
+    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)
+    if image_generation_status:
+        gen_image = processor.text_to_image(gen_text)
+        gen_image_str = pil_image_to_base64(gen_image)
     # Combine the outputs into a single HTML output
     custom_output = f'''
-    <h2>Image->Text->Image:</h2>
+    <h2>Image->Text:</h2>
     <div style="display: flex; flex-wrap: wrap;">
         <div style="flex: 1;">
-            <h3>Image2Text</h3>
-            <p>{gen_text}</p>
+            <h3>Image Caption</h3>
+            <p>{image_caption}</p>
         </div>
         <div style="flex: 1;">
-            <h3>Text2Image</h3>
-            <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
+            <h3>Dense Caption</h3>
+            <p>{dense_caption}</p>
+        </div>
+        <div style="flex: 1;">
+            <h3>Region Semantic</h3>
+            <p>{region_semantic}</p>
+        </div>
+        <div style="flex: 1;">
+            <h3>GPT4 Reasoning:</h3>
+            <p>{gen_text}</p>
         </div>
     </div>
     '''
-
+    if image_generation_status:
+        custom_output += f'''
+        <h2>Text->Image:</h2>
+        <div style="display: flex; flex-wrap: wrap;">
+            <div style="flex: 1;">
+                <h3>Generated Image</h3>
+                <img src="data:image/jpeg;base64,{gen_image_str}" width="400" style="vertical-align: middle;">
+            </div>
+        </div>
+        '''
     return custom_output
 
 processor = ImageTextTransformation(args)
@@ -70,6 +91,7 @@ processor = ImageTextTransformation(args)
 # Create Gradio input and output components
 image_input = gr.inputs.Image(type='filepath', label="Input Image")
 semantic_segment_checkbox = gr.inputs.Checkbox(label="Semantic Segment", default=False)
+image_generation_checkbox = gr.inputs.Checkbox(label="Image Generation", default=False)
 
 logo_base64 = add_logo()
 # Create the title with the logo
@@ -81,15 +103,16 @@ interface = gr.Interface(
     inputs=[image_input,
             gr.CheckboxGroup(
                 label="Options",
-                choices=["Semantic Segment"],
+                choices=["Semantic Segment", "Image Generation"],
             ),
     ],
     outputs=gr.outputs.HTML(),
     title=title_with_logo,
     description="""
     This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
-    \n Semantic segment is very slow in cpu(~8m), best use on gpu or run local.
-    \n Notice the text2image model is controlnet, which used canny edge as reference.
+    \n Since GPU is expensive, we use CPU for the demo. Run the code locally on a GPU, or use the Google Colab we provide, for faster speed.
+    \n Semantic segmentation is very slow on CPU (~8 min).
+    \n The text2image model is ControlNet, which uses Canny edges as reference; it is also very slow on CPU (~2 min).
     """
 )
 
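The reworked process_image above only runs the ControlNet pass when the "Image Generation" box is ticked, which is what makes the CPU-only demo tolerable. Below is a minimal, self-contained sketch of that control flow; StubProcessor is a hypothetical stand-in for ImageTextTransformation, and pil_image_to_base64 is re-derived here because its definition sits outside this diff.

# Minimal sketch of the new process_image flow with stub models (hypothetical
# names, not part of this repo). Runs standalone to show the HTML assembly.
import base64
import io

from PIL import Image


def pil_image_to_base64(image):
    # Re-derived helper; the real one lives outside this diff.
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


class StubProcessor:
    # Mimics ImageTextTransformation's public surface after this commit.
    def image_to_text(self, image_src):
        return ("a cat on a sofa",                    # image_caption
                "cat: left half; sofa: full frame",   # dense_caption
                " ",                                  # region_semantic (disabled)
                "A cat is resting on a sofa.")        # gen_text

    def text_to_image(self, text):
        return Image.new("RGB", (64, 64), "gray")


def process_image(image_src, options, processor):
    image_generation_status = "Image Generation" in options
    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)
    custom_output = (f"<h2>Image->Text:</h2><p>{image_caption}</p>"
                     f"<p>{dense_caption}</p><p>{region_semantic}</p><p>{gen_text}</p>")
    if image_generation_status:
        # The slow ControlNet pass only runs when explicitly requested.
        gen_image_str = pil_image_to_base64(processor.text_to_image(gen_text))
        custom_output += f'<img src="data:image/jpeg;base64,{gen_image_str}">'
    return custom_output


print(process_image("cat.jpg", ["Image Generation"], StubProcessor())[:120])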
models/__pycache__/controlnet_model.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/controlnet_model.cpython-38.pyc and b/models/__pycache__/controlnet_model.cpython-38.pyc differ
 
models/__pycache__/image_text_transformation.cpython-38.pyc CHANGED
Binary files a/models/__pycache__/image_text_transformation.cpython-38.pyc and b/models/__pycache__/image_text_transformation.cpython-38.pyc differ
 
models/controlnet_model.py CHANGED
@@ -15,21 +15,28 @@ class TextToImage:
         self.model = self.initialize_model()
 
     def initialize_model(self):
+        if self.device == 'cpu':
+            self.data_type = torch.float32
+        else:
+            self.data_type = torch.float16
         controlnet = ControlNetModel.from_pretrained(
             "fusing/stable-diffusion-v1-5-controlnet-canny",
-            torch_dtype=torch.float16,
-        )
+            torch_dtype=self.data_type,
+            map_location=self.device, # Add this line
+        ).to(self.device)
         pipeline = StableDiffusionControlNetPipeline.from_pretrained(
             "runwayml/stable-diffusion-v1-5",
             controlnet=controlnet,
             safety_checker=None,
-            torch_dtype=torch.float16,
+            torch_dtype=self.data_type,
+            map_location=self.device, # Add this line
         )
         pipeline.scheduler = UniPCMultistepScheduler.from_config(
             pipeline.scheduler.config
         )
-        pipeline.enable_model_cpu_offload()
         pipeline.to(self.device)
+        if self.device != 'cpu':
+            pipeline.enable_model_cpu_offload()
         return pipeline
 
     @staticmethod
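Two things in this hunk are worth flagging. The float16/float32 split exists because half-precision kernels are poorly supported on CPU, so fp16 only pays off on CUDA. The map_location= keyword, however, looks like a torch.load-style argument that diffusers' from_pretrained does not document, so it is most likely ignored; the explicit .to(self.device) calls are what actually place the weights. A minimal sketch of the same pattern without that kwarg (model IDs as in the diff; assumes diffusers and, for the offload branch, accelerate are installed):

# Sketch of the per-device dtype pattern from initialize_model, as a free
# function. Assumption: fp16 is only worthwhile on CUDA; CPU falls back to fp32.
import torch
from diffusers import (ControlNetModel, StableDiffusionControlNetPipeline,
                       UniPCMultistepScheduler)


def load_controlnet_pipeline(device="cpu"):
    data_type = torch.float16 if device == "cuda" else torch.float32
    controlnet = ControlNetModel.from_pretrained(
        "fusing/stable-diffusion-v1-5-controlnet-canny",
        torch_dtype=data_type,
    ).to(device)
    pipeline = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        controlnet=controlnet,
        safety_checker=None,
        torch_dtype=data_type,
    )
    pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
    pipeline.to(device)
    if device != "cpu":
        # Needs accelerate; streams submodules to the GPU on demand.
        pipeline.enable_model_cpu_offload()
    return pipeline

Note that the diffusers docs suggest calling enable_model_cpu_offload() instead of pipeline.to("cuda") rather than after it, so the GPU branch may still be wasteful; the ordering here only mirrors the commit.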
models/image_text_transformation.py CHANGED
@@ -55,7 +55,7 @@ class ImageTextTransformation:
         else:
             region_semantic = " "
         generated_text = self.gpt_model.paragraph_summary_with_gpt(image_caption, dense_caption, region_semantic, width, height)
-        return generated_text
+        return image_caption, dense_caption, region_semantic, generated_text
 
     def text_to_image(self, text):
         generated_image = self.controlnet_model.text_to_image(text, self.ref_image)
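Because image_to_text now returns a 4-tuple instead of a bare string, any caller still written against the old API will silently receive the whole tuple. A hypothetical compatibility shim, useful only while auditing remaining call sites (not part of the repo):

# Hypothetical shim for the string -> 4-tuple migration of image_to_text;
# purely illustrative of the shape change introduced by this commit.
def unpack_image_to_text(result):
    if isinstance(result, str):
        # Legacy shape: only the GPT summary, no per-part captions.
        return " ", " ", " ", result
    image_caption, dense_caption, region_semantic, generated_text = result
    return image_caption, dense_caption, region_semantic, generated_text


print(unpack_image_to_text("just a summary"))
print(unpack_image_to_text(("caption", "dense", " ", "summary")))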