Spaces:
Runtime error
Runtime error
Nihal D'Souza
committed on
Commit
•
1fdb52f
1
Parent(s):
a804ced
This commit fixes the extractive error
Browse files
- app.py +7 -7
- src/textrank.py +9 -6
app.py
CHANGED
@@ -27,7 +27,7 @@ if summarization_type == 'Abstractive':
|
|
27 |
st.sidebar.caption('Summary will be generated by the T5 Transformer Model')
|
28 |
elif summarization_type == 'Extractive':
|
29 |
st.sidebar.caption('Summary will be generated by a custom TextRank Algorithm')
|
30 |
-
summary_len = st.sidebar.slider('Summary length percentage', 1,
|
31 |
elif summarization_type == 'Both':
|
32 |
st.sidebar.caption('The License text will be first passed through the custom TextRank algorithm and then passed on to the T5 Transformer Model to generate a summary.')
|
33 |
|
@@ -41,14 +41,10 @@ if len(input) > 0:
|
|
41 |
if summarization_type == 'Abstractive':
|
42 |
summary, definitions = summarize_text_with_model(input, model, tokenizer)
|
43 |
if summarization_type == 'Extractive':
|
44 |
-
summary, definitions = custom_textrank_summarizer(input, summary_len = summary_len/
|
45 |
if summarization_type == 'Both':
|
46 |
summary, definitions = summarize_text_with_model(input, model, tokenizer)
|
47 |
summary, _ = custom_textrank_summarizer(summary, summary_len = 1)
|
48 |
-
|
49 |
-
if clean_text:
|
50 |
-
st.header('Cleaned License Text')
|
51 |
-
st.write(clean_license_text(input)[0])
|
52 |
|
53 |
st.header('Summary')
|
54 |
st.write(summary)
|
@@ -59,5 +55,9 @@ if len(input) > 0:
|
|
59 |
|
60 |
if definitions:
|
61 |
st.header('Definitions')
|
62 |
-
st.write(definitions)
|
|
|
|
|
|
|
|
|
63 |
|
|
|
27 |
st.sidebar.caption('Summary will be generated by the T5 Transformer Model')
|
28 |
elif summarization_type == 'Extractive':
|
29 |
st.sidebar.caption('Summary will be generated by a custom TextRank Algorithm')
|
30 |
+
summary_len = st.sidebar.slider('Summary length percentage', 1, 100, 30)
|
31 |
elif summarization_type == 'Both':
|
32 |
st.sidebar.caption('The License text will be first passed through the custom TextRank algorithm and then passed on to the T5 Transformer Model to generate a summary.')
|
33 |
|
|
|
41 |
if summarization_type == 'Abstractive':
|
42 |
summary, definitions = summarize_text_with_model(input, model, tokenizer)
|
43 |
if summarization_type == 'Extractive':
|
44 |
+
summary, definitions = custom_textrank_summarizer(input, summary_len = summary_len/100)
|
45 |
if summarization_type == 'Both':
|
46 |
summary, definitions = summarize_text_with_model(input, model, tokenizer)
|
47 |
summary, _ = custom_textrank_summarizer(summary, summary_len = 1)
|
|
|
|
|
|
|
|
|
48 |
|
49 |
st.header('Summary')
|
50 |
st.write(summary)
|
|
|
55 |
|
56 |
if definitions:
|
57 |
st.header('Definitions')
|
58 |
+
st.write(definitions)
|
59 |
+
|
60 |
+
if clean_text:
|
61 |
+
st.header('Cleaned License Text')
|
62 |
+
st.write(clean_license_text(input)[0])
|
63 |
|
src/textrank.py
CHANGED
@@ -8,8 +8,6 @@ from collections import Counter
|
|
8 |
from src.clean import clean_license_text
|
9 |
from src.read_data import read_file
|
10 |
|
11 |
-
nltk.download('punkt')
|
12 |
-
|
13 |
properties_dict = {
|
14 |
"modify":['modify', 'modification', 'change'],
|
15 |
"distribute":['distribute', 'distribution'],
|
@@ -37,14 +35,19 @@ def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, de
|
|
37 |
'''
|
38 |
TODO: Doctrings
|
39 |
'''
|
40 |
-
summary_len = math.ceil(summary_len*len(license_text.split('.')))
|
41 |
sent_scores = {}
|
42 |
cleaned_license_text, definitions = clean_license_text(license_text)
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
if debug:
|
45 |
print(i.split())
|
46 |
if len(i.split()) < min_sent_len:
|
47 |
-
|
48 |
score = 0
|
49 |
for prop, prop_words in properties_dict.items():
|
50 |
prop_score = 0
|
@@ -52,7 +55,7 @@ def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, de
|
|
52 |
word_count = Counter([tok for tok in lemmatized_tokens])
|
53 |
for prop_word in prop_words:
|
54 |
if prop_word in word_count.keys():
|
55 |
-
prop_score += properties_scores[
|
56 |
if debug:
|
57 |
print(prop, "=", prop_score)
|
58 |
score += prop_score
|
|
|
8 |
from src.clean import clean_license_text
|
9 |
from src.read_data import read_file
|
10 |
|
|
|
|
|
11 |
properties_dict = {
|
12 |
"modify":['modify', 'modification', 'change'],
|
13 |
"distribute":['distribute', 'distribution'],
|
|
|
35 |
'''
|
36 |
TODO: Doctrings
|
37 |
'''
|
|
|
38 |
sent_scores = {}
|
39 |
cleaned_license_text, definitions = clean_license_text(license_text)
|
40 |
+
cleaned_license_sentences = cleaned_license_text.split('.')
|
41 |
+
summary_len = math.ceil(summary_len*len(cleaned_license_sentences))
|
42 |
+
if debug:
|
43 |
+
print(f'summary length:{summary_len}')
|
44 |
+
if debug:
|
45 |
+
print(cleaned_license_sentences)
|
46 |
+
for i in cleaned_license_sentences:
|
47 |
if debug:
|
48 |
print(i.split())
|
49 |
if len(i.split()) < min_sent_len:
|
50 |
+
continue
|
51 |
score = 0
|
52 |
for prop, prop_words in properties_dict.items():
|
53 |
prop_score = 0
|
|
|
55 |
word_count = Counter([tok for tok in lemmatized_tokens])
|
56 |
for prop_word in prop_words:
|
57 |
if prop_word in word_count.keys():
|
58 |
+
prop_score += properties_scores[prop]
|
59 |
if debug:
|
60 |
print(prop, "=", prop_score)
|
61 |
score += prop_score
|