Update README.md
Browse files
README.md
CHANGED
@@ -2,8 +2,6 @@
|
|
2 |
language: en
|
3 |
license: mit
|
4 |
---
|
5 |
-
# Under testing
|
6 |
-
|
7 |
# Kosmos-2.5
|
8 |
|
9 |
[Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
|
@@ -18,41 +16,32 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
|
|
18 |
Because this is a generative model, there is a risk of **hallucination** during generation, and it **CANNOT** guarantee the accuracy of every OCR/Markdown result it produces from an image.
|
19 |
|
20 |
## Use with transformers:
|
21 |
-
```bash
|
22 |
-
pip install git+https://github.com/tic-top/transformers.git
|
23 |
-
```
|
24 |
```python
|
25 |
from PIL import Image
|
26 |
import requests
|
27 |
import torch
|
28 |
-
from transformers import AutoProcessor,
|
29 |
import re
|
30 |
-
|
31 |
-
repo = "kirp/kosmos2_5"
|
32 |
device = "cuda:0"
|
33 |
dtype = torch.bfloat16
|
34 |
-
model =
|
35 |
processor = AutoProcessor.from_pretrained(repo)
|
36 |
-
|
37 |
url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
|
38 |
image = Image.open(requests.get(url, stream=True).raw)
|
39 |
prompt = "<ocr>" # <md>
|
40 |
-
|
41 |
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
42 |
height, width = inputs.pop("height"), inputs.pop("width")
|
43 |
raw_width, raw_height = image.size
|
44 |
scale_height = raw_height / height
|
45 |
scale_width = raw_width / width
|
46 |
-
|
47 |
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
|
48 |
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
|
49 |
-
|
50 |
generated_ids = model.generate(
|
51 |
**inputs,
|
52 |
max_new_tokens=1024,
|
53 |
)
|
54 |
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
55 |
-
|
56 |
def postprocess(y, scale_height, scale_width):
|
57 |
y = y.replace(prompt, "")
|
58 |
if "<md>" in prompt:
|
@@ -73,7 +62,6 @@ def postprocess(y, scale_height, scale_width):
|
|
73 |
y1 = int(y1 * scale_height)
|
74 |
info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
|
75 |
return info
|
76 |
-
|
77 |
output_text = postprocess(generated_text[0], scale_height, scale_width)
|
78 |
print(output_text)
|
79 |
```
|
@@ -115,4 +103,3 @@ The content of this project itself is licensed under the [MIT](https://github.co
|
|
115 |
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
|
116 |
|
117 |
|
118 |
-
|
|
|
2 |
language: en
|
3 |
license: mit
|
4 |
---
|
|
|
|
|
5 |
# Kosmos-2.5
|
6 |
|
7 |
[Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
|
|
|
16 |
Because this is a generative model, there is a risk of **hallucination** during generation, and it **CANNOT** guarantee the accuracy of every OCR/Markdown result it produces from an image.
|
17 |
|
18 |
## Use with transformers:
|
|
|
|
|
|
|
19 |
```python
|
20 |
from PIL import Image
|
21 |
import requests
|
22 |
import torch
|
23 |
+
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
|
24 |
import re
|
25 |
+
repo = "microsoft/kosmos-2.5"
|
|
|
26 |
device = "cuda:0"
|
27 |
dtype = torch.bfloat16
|
28 |
+
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
|
29 |
processor = AutoProcessor.from_pretrained(repo)
|
|
|
30 |
url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
|
31 |
image = Image.open(requests.get(url, stream=True).raw)
|
32 |
prompt = "<ocr>" # <md>
|
|
|
33 |
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
34 |
height, width = inputs.pop("height"), inputs.pop("width")
|
35 |
raw_width, raw_height = image.size
|
36 |
scale_height = raw_height / height
|
37 |
scale_width = raw_width / width
|
|
|
38 |
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
|
39 |
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
|
|
|
40 |
generated_ids = model.generate(
|
41 |
**inputs,
|
42 |
max_new_tokens=1024,
|
43 |
)
|
44 |
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
|
|
45 |
def postprocess(y, scale_height, scale_width):
|
46 |
y = y.replace(prompt, "")
|
47 |
if "<md>" in prompt:
|
|
|
62 |
y1 = int(y1 * scale_height)
|
63 |
info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
|
64 |
return info
|
|
|
65 |
output_text = postprocess(generated_text[0], scale_height, scale_width)
|
66 |
print(output_text)
|
67 |
```
|
|
|
103 |
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
|
104 |
|
105 |
|
|