Sawon2023 commited on
Commit
68c32d7
1 Parent(s): c78d4ac

Q&A Generator from PDF (Text not Image)

Browse files
Files changed (4) hide show
  1. .env +1 -0
  2. README.md +28 -13
  3. app.py +37 -0
  4. pdftoqa_generator.py +72 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY="<YOUR_OPENAI_API_KEY>"  # SECURITY: never commit a real key — the key originally committed here was exposed publicly and must be revoked
README.md CHANGED
@@ -1,13 +1,28 @@
1
- ---
2
- title: Llm Pdf Qa
3
- emoji: 📉
4
- colorFrom: green
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 3.44.4
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Make Question and Answer from your PDF
2
+
3
+ ### Setup Environment:
4
+
5
+ 1. Create an account at https://openai.com/ and generate your own API key
6
+
7
+ 2. Download the following libraries and packages:
8
+ a. !pip install langchain
9
+ b. !pip install pypdf
10
+ c. !pip install transformers==4.33.1
11
+ This particular package will install the following dependencies:
12
+ 1. huggingface-hub-0.17.1
13
+ 2. safetensors-0.3.3
14
+ 3. tokenizers-0.13.3
15
+ d. !pip install gradio
16
+
17
+ ### Run the System
18
+
19
+ 1. Run the file:
20
+ ```
21
+ python3 app.py
22
+ ```
23
+ 2. Copy the URL from the terminal and paste it into your browser
24
+
25
+ 3. Upload your PDF & Get the Questions from each page of the PDF
26
+
27
+
28
+
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ import statistics
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ from pdftoqa_generator import *
9
+
10
+
11
def predict(file):
    """Run the full pipeline on an uploaded PDF: parse it into text
    chunks, then generate question/answer notes from those chunks."""
    parsed_chunks = pdf_parser(file)
    return qa_generator(parsed_chunks)
17
+
18
+
19
description = """Do you have a long document and a bunch of questions that can be answered given the data in this file?
Fear not for this demo is for you.
Upload your pdf, ask your questions and wait for the magic to happen.
DISCLAIMER: I have no idea what happens to the pdfs that you upload and who has access to them so make sure there is nothing confidential there.
"""
title = "QA answering from a pdf."

# Gradio 3.x: `gr.inputs.File` is deprecated in favour of `gr.File`, and the
# `allow_screenshot` kwarg was removed from the Interface constructor.
iface = gr.Interface(
    fn=predict,
    inputs=[gr.File()],
    outputs="text",
    description=description,
    title=title,
)

# `enable_queue` as a launch kwarg is deprecated in Gradio 3.x; queueing is
# enabled via `queue()` before `launch()`.
iface.queue().launch(show_error=True)
pdftoqa_generator.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ import statistics
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ from langchain.document_loaders import PyPDFLoader
9
+ from langchain.text_splitter import (
10
+ CharacterTextSplitter,
11
+ RecursiveCharacterTextSplitter,
12
+ )
13
+ from tqdm import tqdm
14
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
+
16
# NOTE(review): a placeholder key is hard-coded at import time, and nothing in
# this file loads the committed .env file — so a real key would have to be
# exported in the environment before launch. No OpenAI API call is visible in
# this file (the models below are HF transformers); confirm whether this env
# var is actually needed at all.
os.environ["OPENAI_API_KEY"] = "sk-"
17
+
18
+
19
def pdf_parser(file_path):
    """Load a PDF and split its page text into overlapping chunks.

    Args:
        file_path: path to the PDF file to parse.

    Returns:
        A list of langchain Documents, one per text chunk, ready to be fed
        into the QA generator.
    """
    pages = PyPDFLoader(file_path).load()
    page_texts = [page.page_content for page in pages]

    # Deliberately small chunks with generous overlap, so each chunk carries
    # enough surrounding context for question generation.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.create_documents(page_texts)
37
+
38
+
39
def qa_generator(texts):
    """Generate one question/answer pair per text chunk with a T5 model.

    Args:
        texts: iterable of langchain Documents (each exposing `.page_content`).

    Returns:
        A list of record dicts with keys "No", "Question", "Answer".
    """
    model_name = "potsawee/t5-large-generation-squad-QuestionAnswer"
    question_tokenizer = AutoTokenizer.from_pretrained(model_name)
    question_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # NOTE(review): keyed by question text, so a duplicate question from a
    # later chunk silently overwrites the earlier answer — confirm intended.
    question_answer_dic = {}
    for chunk in tqdm(texts):
        context = chunk.page_content
        try:
            inputs = question_tokenizer(context, return_tensors="pt")
            outputs = question_model.generate(**inputs, max_length=100)
            # Decode WITH special tokens so the model's sep token survives,
            # then strip pad/eos manually before splitting.
            question_answer = question_tokenizer.decode(
                outputs[0], skip_special_tokens=False
            )
            question_answer = question_answer.replace(
                question_tokenizer.pad_token, ""
            ).replace(question_tokenizer.eos_token, "")
            # Unpack raises ValueError when the generation does not contain
            # exactly one sep token; that is the expected failure mode here.
            # (Was a bare `except:` that swallowed every error, including
            # KeyboardInterrupt — now only malformed generations are skipped.)
            question, answer = question_answer.split(question_tokenizer.sep_token)
        except ValueError:
            print(f"Skipping chunk without a clean question/answer split: {chunk}")
            continue
        question_answer_dic[question] = answer

    # Build the table in one shot from records (explicit columns keep the
    # output shape stable even when no pairs were generated).
    qa_notes_df = pd.DataFrame(
        [
            {"No": idx, "Question": q, "Answer": a}
            for idx, (q, a) in enumerate(question_answer_dic.items(), start=1)
        ],
        columns=["No", "Question", "Answer"],
    )
    return qa_notes_df.to_dict("records")