PereLluis13 committed on
Commit
1501279
1 Parent(s): cbf1f0e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +19 -14
README.md CHANGED
@@ -32,23 +32,25 @@ from transformers import pipeline
32
 
33
  triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')
34
  # We need to use the tokenizer manually since we need special tokens.
35
- extracted_text = triplet_extractor.tokenizer.decode(triplet_extractor("Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic", return_tensors=True, return_text=False)[0]["generated_token_ids"])
36
- print(extracted_text)
37
  # Function to parse the generated text and extract the triplets
38
  def extract_triplets(text):
39
  triplets = []
40
- relation = ''
41
- for token in text.split():
 
 
42
  if token == "<triplet>":
43
  current = 't'
44
  if relation != '':
45
- triplets.append((subject, relation, object_))
46
  relation = ''
47
  subject = ''
48
  elif token == "<subj>":
49
  current = 's'
50
  if relation != '':
51
- triplets.append((subject, relation, object_))
52
  object_ = ''
53
  elif token == "<obj>":
54
  current = 'o'
@@ -60,32 +62,34 @@ def extract_triplets(text):
60
  object_ += ' ' + token
61
  elif current == 'o':
62
  relation += ' ' + token
63
- triplets.append((subject, relation, object_))
 
64
  return triplets
65
- extracted_triplets = extract_triplets(extracted_text)
66
  print(extracted_triplets)
67
  ```
68
 
69
  ## Model and Tokenizer using transformers
70
 
71
  ```python3
72
-
73
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
74
 
75
  def extract_triplets(text):
76
  triplets = []
77
- relation = ''
78
- for token in text.split():
 
 
79
  if token == "<triplet>":
80
  current = 't'
81
  if relation != '':
82
- triplets.append((subject, relation, object_))
83
  relation = ''
84
  subject = ''
85
  elif token == "<subj>":
86
  current = 's'
87
  if relation != '':
88
- triplets.append((subject, relation, object_))
89
  object_ = ''
90
  elif token == "<obj>":
91
  current = 'o'
@@ -97,7 +101,8 @@ def extract_triplets(text):
97
  object_ += ' ' + token
98
  elif current == 'o':
99
  relation += ' ' + token
100
- triplets.append((subject, relation, object_))
 
101
  return triplets
102
 
103
  # Load model and tokenizer
 
32
 
33
  triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')
34
  # We need to use the tokenizer manually since we need special tokens.
35
+ extracted_text = triplet_extractor.tokenizer.batch_decode(triplet_extractor("Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic", return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"])
36
+ print(extracted_text[0])
37
  # Function to parse the generated text and extract the triplets
38
  def extract_triplets(text):
39
  triplets = []
40
+ relation, subject, relation, object_ = '', '', '', ''
41
+ text = text.strip()
42
+ current = 'x'
43
+ for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
44
  if token == "<triplet>":
45
  current = 't'
46
  if relation != '':
47
+ triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
48
  relation = ''
49
  subject = ''
50
  elif token == "<subj>":
51
  current = 's'
52
  if relation != '':
53
+ triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
54
  object_ = ''
55
  elif token == "<obj>":
56
  current = 'o'
 
62
  object_ += ' ' + token
63
  elif current == 'o':
64
  relation += ' ' + token
65
+ if subject != '' and relation != '' and object_ != '':
66
+ triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
67
  return triplets
68
+ extracted_triplets = extract_triplets(extracted_text[0])
69
  print(extracted_triplets)
70
  ```
71
 
72
  ## Model and Tokenizer using transformers
73
 
74
  ```python3
 
75
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
76
 
77
  def extract_triplets(text):
78
  triplets = []
79
+ relation, subject, relation, object_ = '', '', '', ''
80
+ text = text.strip()
81
+ current = 'x'
82
+ for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
83
  if token == "<triplet>":
84
  current = 't'
85
  if relation != '':
86
+ triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
87
  relation = ''
88
  subject = ''
89
  elif token == "<subj>":
90
  current = 's'
91
  if relation != '':
92
+ triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
93
  object_ = ''
94
  elif token == "<obj>":
95
  current = 'o'
 
101
  object_ += ' ' + token
102
  elif current == 'o':
103
  relation += ' ' + token
104
+ if subject != '' and relation != '' and object_ != '':
105
+ triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
106
  return triplets
107
 
108
  # Load model and tokenizer