Fix for multiple graphics cards

#2
Files changed (2) hide show
  1. README.md +1 -4
  2. app-multi-alpha.py +0 -210
README.md CHANGED
@@ -2,7 +2,7 @@
2
 
3
  ## Overview
4
 
5
- This application generates descriptive captions for images using advanced ML models. It processes single images or entire directories, leveraging CLIP and LLM models for accurate and contextual captions. It has NSFW captioning support with natural language. This is just an extension of the original author's efforts to improve performance. Their repo is located here: https://huggingface.co/spaces/fancyfeast/joy-caption-pre-alpha.
6
 
7
  ## Features
8
 
@@ -44,8 +44,6 @@ git clone https://huggingface.co/Wi-zz/joy-caption-pre-alpha
44
  cd joy-caption-pre-alpha
45
  python -m venv venv
46
  .\venv\Scripts\activate
47
- # Change as per https://pytorch.org/get-started/locally/
48
- pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
49
  pip install -r requirements.txt
50
  ```
51
 
@@ -56,7 +54,6 @@ git clone https://huggingface.co/Wi-zz/joy-caption-pre-alpha
56
  cd joy-caption-pre-alpha
57
  python3 -m venv venv
58
  source venv/bin/activate
59
- pip3 install torch torchvision torchaudio
60
  pip3 install -r requirements.txt
61
  ```
62
 
 
2
 
3
  ## Overview
4
 
5
+ This application generates descriptive captions for images using advanced ML models. It processes single images or entire directories, leveraging CLIP and LLM models for accurate and contextual captions. It has NSFW captioning support with natural language. This is just an extension of the original author's efforts to improve performance. Their report is located here: https://huggingface.co/spaces/fancyfeast/joy-caption-pre-alpha.
6
 
7
  ## Features
8
 
 
44
  cd joy-caption-pre-alpha
45
  python -m venv venv
46
  .\venv\Scripts\activate
 
 
47
  pip install -r requirements.txt
48
  ```
49
 
 
54
  cd joy-caption-pre-alpha
55
  python3 -m venv venv
56
  source venv/bin/activate
 
57
  pip3 install -r requirements.txt
58
  ```
59
 
app-multi-alpha.py DELETED
@@ -1,210 +0,0 @@
1
- import torch
2
- import torch.amp.autocast_mode
3
- import torch.distributed as dist
4
- import torch.multiprocessing as mp
5
- from torch.nn.parallel import DistributedDataParallel as DDP
6
- import os
7
- import sys
8
- import logging
9
- import warnings
10
- import argparse
11
- from PIL import Image
12
- from pathlib import Path
13
- from tqdm import tqdm
14
- from torch import nn
15
- from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
16
- from typing import List, Union
17
-
18
- # Constants
19
- CLIP_PATH = "google/siglip-so400m-patch14-384"
20
- VLM_PROMPT = "A descriptive caption for this image:\n"
21
- MODEL_PATH = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
22
- CHECKPOINT_PATH = Path("wpkklhc6")
23
- IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')
24
-
25
- warnings.filterwarnings("ignore", category=UserWarning)
26
- logging.getLogger("transformers").setLevel(logging.ERROR)
27
-
28
- def setup(rank, world_size):
29
- os.environ['MASTER_ADDR'] = 'localhost'
30
- os.environ['MASTER_PORT'] = '12355'
31
- dist.init_process_group("nccl", rank=rank, world_size=world_size)
32
-
33
- def cleanup():
34
- dist.destroy_process_group()
35
-
36
- class ImageAdapter(nn.Module):
37
- def __init__(self, input_features: int, output_features: int):
38
- super().__init__()
39
- self.linear1 = nn.Linear(input_features, output_features)
40
- self.activation = nn.GELU()
41
- self.linear2 = nn.Linear(output_features, output_features)
42
-
43
- def forward(self, vision_outputs: torch.Tensor):
44
- return self.linear2(self.activation(self.linear1(vision_outputs)))
45
-
46
- def load_models(rank):
47
- print(f"Loading CLIP 📎 on GPU {rank}")
48
- clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
49
- clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model.eval().requires_grad_(False).to(rank)
50
-
51
- print(f"Loading tokenizer 🪙 on GPU {rank}")
52
- tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
53
- assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}"
54
-
55
- print(f"Loading LLM 🤖 on GPU {rank}")
56
- text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map={"": rank}, torch_dtype=torch.bfloat16).eval()
57
-
58
- print(f"Loading image adapter 🖼️ on GPU {rank}")
59
- image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size)
60
- image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location=f"cuda:{rank}", weights_only=True))
61
- image_adapter.eval().to(rank)
62
-
63
- return clip_processor, clip_model, tokenizer, text_model, image_adapter
64
-
65
- @torch.no_grad()
66
- def stream_chat(input_images: List[Image.Image], batch_size: int, pbar: tqdm, models: tuple, rank: int) -> List[str]:
67
- clip_processor, clip_model, tokenizer, text_model, image_adapter = models
68
- torch.cuda.empty_cache()
69
- all_captions = []
70
-
71
- for i in range(0, len(input_images), batch_size):
72
- batch = input_images[i:i+batch_size]
73
-
74
- try:
75
- images = clip_processor(images=batch, return_tensors='pt', padding=True).pixel_values.to(rank)
76
- except ValueError as e:
77
- print(f"Error processing image batch: {e}")
78
- print("Skipping this batch and continuing...")
79
- continue
80
-
81
- with torch.amp.autocast_mode.autocast(device_type='cuda', enabled=True):
82
- vision_outputs = clip_model(pixel_values=images, output_hidden_states=True)
83
- image_features = vision_outputs.hidden_states[-2]
84
- embedded_images = image_adapter(image_features).to(dtype=torch.bfloat16)
85
-
86
- prompt = tokenizer.encode(VLM_PROMPT, return_tensors='pt')
87
- prompt_embeds = text_model.model.embed_tokens(prompt.to(rank)).to(dtype=torch.bfloat16)
88
- embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=rank, dtype=torch.int64)).to(dtype=torch.bfloat16)
89
-
90
- inputs_embeds = torch.cat([
91
- embedded_bos.expand(embedded_images.shape[0], -1, -1),
92
- embedded_images,
93
- prompt_embeds.expand(embedded_images.shape[0], -1, -1),
94
- ], dim=1).to(dtype=torch.bfloat16)
95
-
96
- input_ids = torch.cat([
97
- torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long).expand(embedded_images.shape[0], -1),
98
- torch.zeros((embedded_images.shape[0], embedded_images.shape[1]), dtype=torch.long),
99
- prompt.expand(embedded_images.shape[0], -1),
100
- ], dim=1).to(rank)
101
-
102
- attention_mask = torch.ones_like(input_ids)
103
-
104
- generate_ids = text_model.generate(
105
- input_ids=input_ids,
106
- inputs_embeds=inputs_embeds,
107
- attention_mask=attention_mask,
108
- max_new_tokens=300,
109
- do_sample=True,
110
- top_k=10,
111
- temperature=0.5,
112
- )
113
-
114
- generate_ids = generate_ids[:, input_ids.shape[1]:]
115
-
116
- for ids in generate_ids:
117
- caption = tokenizer.decode(ids[:-1] if ids[-1] == tokenizer.eos_token_id else ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
118
- caption = caption.replace('<|end_of_text|>', '').replace('<|finetune_right_pad_id|>', '').strip()
119
- all_captions.append(caption)
120
-
121
- if pbar and rank == 0:
122
- pbar.update(len(batch))
123
-
124
- return all_captions
125
-
126
- def process_directory(rank, world_size, input_dir: Path, output_dir: Path, batch_size: int, models: tuple):
127
- output_dir.mkdir(parents=True, exist_ok=True)
128
- image_files = [f for f in input_dir.iterdir() if f.suffix.lower() in IMAGE_EXTENSIONS]
129
- images_to_process = [f for f in image_files if not (output_dir / f"{f.stem}.txt").exists()]
130
-
131
- if not images_to_process:
132
- if rank == 0:
133
- print("No new images to process.")
134
- return
135
-
136
- # Distribute images across GPUs
137
- images_per_gpu = len(images_to_process) // world_size
138
- start_idx = rank * images_per_gpu
139
- end_idx = start_idx + images_per_gpu if rank < world_size - 1 else len(images_to_process)
140
- gpu_images = images_to_process[start_idx:end_idx]
141
-
142
- if rank == 0:
143
- pbar = tqdm(total=len(images_to_process), desc="Processing images", unit="image")
144
- else:
145
- pbar = None
146
-
147
- for i in range(0, len(gpu_images), batch_size):
148
- batch_files = gpu_images[i:i+batch_size]
149
- batch_images = [Image.open(f).convert('RGB') for f in batch_files]
150
-
151
- captions = stream_chat(batch_images, batch_size, pbar, models, rank)
152
-
153
- for file, caption in zip(batch_files, captions):
154
- with open(output_dir / f"{file.stem}.txt", 'w', encoding='utf-8') as f:
155
- f.write(caption)
156
-
157
- for img in batch_images:
158
- img.close()
159
-
160
- if rank == 0:
161
- pbar.close()
162
-
163
- def parse_arguments():
164
- parser = argparse.ArgumentParser(description="Process images and generate captions.")
165
- parser.add_argument("input", nargs='+', help="Input image file or directory (or multiple directories)")
166
- parser.add_argument("--output", help="Output directory (optional)")
167
- parser.add_argument("--bs", type=int, default=4, help="Batch size (default: 4)")
168
- return parser.parse_args()
169
-
170
- def run(rank, world_size, args):
171
- setup(rank, world_size)
172
-
173
- input_paths = [Path(input_path) for input_path in args.input]
174
- batch_size = args.bs
175
- models = load_models(rank)
176
-
177
- for input_path in input_paths:
178
- if input_path.is_file() and input_path.suffix.lower() in IMAGE_EXTENSIONS:
179
- if rank == 0:
180
- output_path = input_path.with_suffix('.txt')
181
- print(f"Processing single image 🎞️: {input_path.name}")
182
- with tqdm(total=1, desc="Processing image", unit="image") as pbar:
183
- captions = stream_chat([Image.open(input_path).convert('RGB')], 1, pbar, models, rank)
184
- with open(output_path, 'w', encoding='utf-8') as f:
185
- f.write(captions[0])
186
- print(f"Output saved to {output_path}")
187
- elif input_path.is_dir():
188
- output_path = Path(args.output) if args.output else input_path
189
- if rank == 0:
190
- print(f"Processing directory 📁: {input_path}")
191
- print(f"Output directory 📦: {output_path}")
192
- print(f"Batch size 🗄️: {batch_size}")
193
- process_directory(rank, world_size, input_path, output_path, batch_size, models)
194
- else:
195
- if rank == 0:
196
- print(f"Invalid input: {input_path}")
197
- print("Skipping...")
198
-
199
- cleanup()
200
-
201
- def main():
202
- args = parse_arguments()
203
- world_size = torch.cuda.device_count()
204
- if world_size > 1:
205
- mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
206
- else:
207
- run(0, 1, args)
208
-
209
- if __name__ == "__main__":
210
- main()