srinivas-mushroom samarthagarwal23 commited on
Commit
301a986
0 Parent(s):

Duplicate from samarthagarwal23/QuestionAnswering_on_annual_reports

Browse files

Co-authored-by: Samarth <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ dbs-annual-report-2020.pdf filter=lfs diff=lfs merge=lfs -text
NASDAQ_AAPL_2020.pdf ADDED
The diff for this file is too large to render. See raw diff
 
NASDAQ_MSFT_2020.pdf ADDED
Binary file (861 kB). View file
 
README.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: QuestionAnswering_on_annual_reports
3
+ emoji: 🚀
4
+ colorFrom: green
5
+ colorTo: purple
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: false
9
+ duplicated_from: samarthagarwal23/QuestionAnswering_on_annual_reports
10
+ ---
11
+
12
+ # Configuration
13
+
14
+ `title`: _string_
15
+ Display title for the Space
16
+
17
+ `emoji`: _string_
18
+ Space emoji (emoji-only character allowed)
19
+
20
+ `colorFrom`: _string_
21
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
22
+
23
+ `colorTo`: _string_
24
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
25
+
26
+ `sdk`: _string_
27
+ Can be either `gradio`, `streamlit`, or `static`
28
+
29
+ `sdk_version` : _string_
30
+ Only applicable for `streamlit` SDK.
31
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
32
+
33
+ `app_file`: _string_
34
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
35
+ Path is relative to the root of the repository.
36
+
37
+ `models`: _List[string]_
38
+ HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
39
+ Will be parsed automatically from your code if not specified here.
40
+
41
+ `datasets`: _List[string]_
42
+ HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
43
+ Will be parsed automatically from your code if not specified here.
44
+
45
+ `pinned`: _boolean_
46
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import subprocess
import sys

import gradio as gr
import numpy as np

# NOTE(review): installing dependencies at app start-up is fragile (a Spaces
# requirements.txt is the usual fix), but it is kept to preserve behavior.
# os.system() silently dropped the installer's exit status; check_call raises
# immediately if the install fails instead of letting the imports below crash
# with a confusing ModuleNotFoundError.
subprocess.check_call([sys.executable, "-m", "pip", "install",
                       "pdfminer.six", "rank_bm25", "torch", "transformers"])

from gradio.mix import Series  # noqa: F401  (unused here; kept — do not drop file-level imports)
from rank_bm25 import BM25Okapi
import string
import torch
from transformers import pipeline
import pdfminer
from pdfminer.high_level import extract_text

# Chunking / retrieval hyper-parameters.
len_doc = 500              # characters per document chunk
overlap = 15               # characters shared by consecutive chunks
param_top_k_retriver = 15  # candidates kept by the BM25 retriever
param_top_k_ranker = 3     # answers kept by the QA re-ranker
20
def read_pdf(file):
    """Extract the text of an uploaded PDF and split it into overlapping chunks.

    Consecutive chunks share ``overlap`` characters so an answer that straddles
    a chunk boundary is not lost. Returns the list of chunk strings.
    """
    full_text = extract_text(file.name)
    step = len_doc - overlap
    return [full_text[pos:pos + len_doc] for pos in range(0, len(full_text), step)]
30
+
31
# We use BM25 as the retriever, which does a first round of candidate
# filtering based on word-level matching.

# Question words and articles carry no signal for lexical matching.
# Hoisted to a module-level frozenset: the original rebuilt this list on
# every call and paid O(n) list membership per token.
_BM25_STOP_WORDS = frozenset(
    ['a', 'the', 'am', 'is', 'are', 'who', 'how', 'where', 'when', 'why', 'what']
)


def bm25_tokenizer(text):
    """Tokenize *text* for BM25: lowercase, split on whitespace, strip
    surrounding punctuation, and drop stop words.

    Returns the list of surviving tokens (possibly empty).
    """
    # Stripping punctuation can leave an empty string (e.g. "--"), so the
    # truthiness check is required, not just the stop-word check.
    return [
        token
        for word in text.lower().split()
        if (token := word.strip(string.punctuation)) and token not in _BM25_STOP_WORDS
    ]
42
+
43
def retrieval(query, top_k_retriver, docs, bm25_):
    """Score every chunk against *query* with BM25 and keep the best hits.

    Returns a list of ``{'corpus_id', 'score', 'docs'}`` dicts sorted by
    score, best first. Chunks with a zero BM25 score are dropped, so the
    result may be shorter than *top_k_retriver* (or empty).
    """
    scores = bm25_.get_scores(bm25_tokenizer(query))
    # argsort ascending, reversed -> indices of the highest-scoring chunks.
    candidate_ids = np.argsort(scores)[::-1][:top_k_retriver]

    hits = []
    for idx in candidate_ids:
        if scores[idx] > 0:
            hits.append({'corpus_id': idx,
                         'score': scores[idx],
                         'docs': docs[idx]})
    hits.sort(key=lambda hit: hit['score'], reverse=True)
    return hits
53
+
54
def qa_ranker(query, docs_, top_k_ranker, qa_model):
    """Run the extractive QA model over each candidate chunk and re-rank.

    Each answer dict returned by *qa_model* is tagged with the chunk it came
    from under the ``'doc'`` key; the *top_k_ranker* highest-scoring answers
    are returned, best first.
    """
    scored = []
    for passage in docs_:
        prediction = qa_model(question=query, context=passage)
        prediction['doc'] = passage
        scored.append(prediction)
    scored.sort(key=lambda p: p['score'], reverse=True)
    return scored[:top_k_ranker]
62
+
63
def cstr(s, color='black'):
    """Wrap *s* in a <text> tag with the given CSS color."""
    return f"<text style=color:{color}>{s}</text>"


def cstr_bold(s, color='black'):
    """Like :func:`cstr`, but the content is bold."""
    return f"<text style=color:{color}><b>{s}</b></text>"


def cstr_break(s, color='black'):
    """Like :func:`cstr`, but the content is preceded by a line break."""
    return f"<text style=color:{color}><br>{s}</text>"
69
+
70
def print_colored(text, start_idx, end_idx, confidence):
    """Render *text* as HTML with the answer span ``[start_idx:end_idx]``
    highlighted in bold blue and a grey confidence note on a new line.

    *confidence* is a pre-formatted string (e.g. ``"87.5%"``).
    """
    before = text[:start_idx]
    answer = cstr_bold(text[start_idx:end_idx], color='blue')
    after = text[end_idx:]
    note = cstr_break('- Confidence: ' + confidence, color='grey')
    return cstr(' '.join([before, answer, after, note]), color='black')
77
+
78
def final_qa_pipeline(file, query, model_nm):
    """End-to-end QA over an uploaded PDF.

    Chunks the PDF, retrieves candidate chunks with BM25, re-ranks them with
    a deepset extractive-QA model (*model_nm* names the checkpoint suffix),
    and returns a 2-tuple of HTML strings for the top two answers.
    ``("No match", "No match")`` when BM25 finds nothing.
    """
    docs = read_pdf(file)
    bm25 = BM25Okapi([bm25_tokenizer(chunk) for chunk in docs])

    top_k_retriver, top_k_ranker = param_top_k_retriver, param_top_k_ranker
    candidates = retrieval(query, top_k_retriver, docs, bm25)

    qa_model = pipeline("question-answering",
                        model="deepset/" + str(model_nm))

    if not candidates:
        return ("No match", "No match")

    ranked = qa_ranker(query, [c["docs"] for c in candidates], top_k_ranker, qa_model)

    def render(hit):
        # Highlight the answer span and show the model score as a percentage.
        return print_colored(hit['doc'], hit['start'], hit['end'],
                             str(np.round(100 * hit["score"], 1)) + "%")

    top1 = render(ranked[0])
    # Mirrors the original: the second answer is shown only when BM25
    # produced more than one candidate (which guarantees ranked[1] exists).
    top2 = render(ranked[1]) if len(candidates) > 1 else "None"
    return (top1, top2)
103
+
104
# Pre-canned demo questions; every example uses the lightweight MiniLM model.
_DEFAULT_MODEL = "minilm-uncased-squad2"
examples = [
    [os.path.abspath(pdf), question, _DEFAULT_MODEL]
    for pdf, question in [
        ("dbs-annual-report-2020.pdf", "how many times has DBS won Best bank in the world ?"),
        ("dbs-annual-report-2020.pdf", "how much dividend was paid to shareholders ?"),
        ("dbs-annual-report-2020.pdf", "what is the sustainability focus ?"),
        ("NASDAQ_AAPL_2020.pdf", "how much are the outstanding shares ?"),
        ("NASDAQ_AAPL_2020.pdf", "what is competitors strategy ?"),
        ("NASDAQ_AAPL_2020.pdf", "who is the chief executive officer ?"),
        ("NASDAQ_MSFT_2020.pdf", "How much is the guided revenue for next quarter?"),
    ]
]

iface = gr.Interface(
    fn=final_qa_pipeline,
    inputs=[
        gr.inputs.File(label="input pdf file"),
        gr.inputs.Textbox(label="Question:"),
        gr.inputs.Dropdown(choices=["minilm-uncased-squad2", "roberta-base-squad2"],
                           label="Model"),
    ],
    outputs=[
        gr.outputs.HTML(label="Top 1 answer"),
        gr.outputs.HTML(label="Top 2 answer"),
    ],
    examples=examples,
    theme="grass",
    title="Question Answering on annual reports",
    description="Navigate long annual reports by using Machine learning to answer your questions. \nSimply upload any annual report pdf you are interested in and ask model a question OR load an example from below.",
)
iface.launch(enable_queue=True)
dbs-annual-report-2020.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95056380018cf9eb93911ce026783ed99531881271c59a0bbb239fe6354854ee
3
+ size 11581751