sivan22 committed
Commit 65aba84 • 1 parent: ba85985

Update app.py

Files changed (1)
  1. app.py +35 -4
app.py CHANGED
@@ -9,10 +9,41 @@ for idx, url in enumerate(urls):
     image = Image.open(requests.get(url, stream=True).raw)
     image.save(f"image_{idx}.png")
 
+from transformers import BertTokenizer, BasicTokenizer
+from transformers.tokenization_utils import _is_punctuation
 
-image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-patch4-window7-224")
-tokenizer = tokenizer =BertTokenizerFast.from_pretrained("onlplab/alephbert-base")
-model = VisionEncoderDecoderModel.from_pretrained("sivan22/hdd-words-ocr")
+class OurBasicTokenizer(BasicTokenizer):
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if text in self.never_split or (never_split and text in never_split):
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char) and char != "'" and not (char == '"' and i + 1 < len(chars) and not _is_punctuation(chars[i + 1])):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+
+def RabbinicTokenizer(tok):
+    tok.basic_tokenizer = OurBasicTokenizer(tok.basic_tokenizer.do_lower_case, tok.basic_tokenizer.never_split)
+    return tok
+
+
+image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
+tokenizer = RabbinicTokenizer(BertTokenizer.from_pretrained("sivan22/BEREL"))
+model = VisionEncoderDecoderModel.from_pretrained("sivan22/ABBA-HTR")
 
 
 def process_image(image):
@@ -28,7 +59,7 @@ def process_image(image):
     return generated_text
 
 title = "הדגמה: פענוח כתב יד באמצעות בינה מלאכותית"
-description = "על בסיס מודל swin בצד התמונה, ומודל alephbert בצד הטקסט."
+description = "על בסיס טכנולוגיית trOCR"
 article = "<p style='text-align: center'>sivan22</p>"
 examples =[["image_0.png"], ["image_1.png"], ["image_2.png"]]
 
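Note on the change: the new OurBasicTokenizer._run_split_on_punc splits every punctuation character into its own token except an apostrophe, and except a double quote that is directly followed by another character that is not punctuation; this presumably keeps geresh/gershayim-style abbreviations common in rabbinic Hebrew (e.g. גמ', ביהמ"ד) attached to their word when the BEREL tokenizer processes text. The (unchanged) title string translates to "Demo: deciphering handwriting using artificial intelligence", and the description changes from "based on a swin model on the image side, and an alephbert model on the text side" to "based on trOCR technology", in line with the switch to a Swin V2 image processor and the sivan22/ABBA-HTR checkpoint.

Below is a minimal inference sketch, assuming the updated app.py has already run so that image_processor, tokenizer and model are in scope. The body of process_image() is not part of this diff, so the trOCR-style pipeline and the read_handwriting() helper name are assumptions, not the app's exact code.

# Sketch only: assumes image_processor, tokenizer and model were loaded as in the
# diff above; read_handwriting() is a hypothetical helper, not taken from app.py.
from PIL import Image

def read_handwriting(path: str) -> str:
    image = Image.open(path).convert("RGB")                                   # load a handwriting crop
    pixel_values = image_processor(image, return_tensors="pt").pixel_values   # encoder input features
    generated_ids = model.generate(pixel_values)                              # autoregressive decoding
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)       # ids -> Hebrew text

print(read_handwriting("image_0.png"))  # one of the example images the app saves at startup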