ydshieh HF staff committed on
Commit
108ce13
1 Parent(s): ada6118

Update processing_kosmos2.py

Browse files
Files changed (1) hide show
  1. processing_kosmos2.py +6 -3
processing_kosmos2.py CHANGED
@@ -272,13 +272,14 @@ class Kosmos2Processor(ProcessorMixin):
272
  )
273
 
274
  def preprocess_single(text, image, bboxes):
 
275
  if image is not None:
276
  # Add `<image> ... (fake) image tokens ... </image>`
277
  text = f"{img_info} {text}"
278
 
279
- # Add `<object> <patch_idx_xxxx> <patch_idx_yyy> </object>` after `<phrase> phrase text </phrase>`
280
- text = self._insert_patch_index_tokens(text, bboxes)
281
- text = self._add_remove_spaces_around_tag_tokens(text)
282
 
283
  return text
284
 
@@ -418,6 +419,8 @@ class Kosmos2Processor(ProcessorMixin):
418
  )
419
  pattern = "|".join(tag_tokens)
420
  splits = re.split(rf"({pattern})", text)
 
 
421
 
422
  output = ""
423
  prev_str_in_targets = False
 
272
  )
273
 
274
  def preprocess_single(text, image, bboxes):
275
+ text = text.strip()
276
  if image is not None:
277
  # Add `<image> ... (fake) image tokens ... </image>`
278
  text = f"{img_info} {text}"
279
 
280
+ # Add `<object> <patch_idx_xxxx> <patch_idx_yyy> </object>` after `<phrase> phrase text </phrase>`
281
+ text = self._insert_patch_index_tokens(text, bboxes)
282
+ text = self._add_remove_spaces_around_tag_tokens(text)
283
 
284
  return text
285
 
 
419
  )
420
  pattern = "|".join(tag_tokens)
421
  splits = re.split(rf"({pattern})", text)
422
+ # Don't keep the leading and trailing empty strings produced by `re.split`, if any
423
+ splits = [split for idx, split in enumerate(splits) if not (idx in [0, len(splits) - 1] and split == "")]
424
 
425
  output = ""
426
  prev_str_in_targets = False