Spaces:
Sleeping
Sleeping
raseel-zymr
committed on
Commit
•
eaf0e00
1
Parent(s):
dd2ca7e
Initial commit with streamlit
Browse files- .gitignore +1 -0
- README.md +15 -1
- app.py +93 -0
- requirements.txt +161 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__
|
README.md
CHANGED
@@ -10,4 +10,18 @@ pinned: false
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
+
# Document Question & Answer
|
14 |
+
A LangChain-based application that lets you upload any text or PDF document, ask questions about it, and get summarised answers.
|
15 |
+
|
16 |
+
|
17 |
+
### Pre-requisites
|
18 |
+
|
19 |
+
$ pip install langchain huggingface_hub sentence_transformers faiss-cpu unstructured chromadb Cython tiktoken unstructured[local-inference]
|
20 |
+
|
21 |
+
Or
|
22 |
+
|
23 |
+
$ pip install -r requirements.txt
|
24 |
+
|
25 |
+
* Install the above Python packages
|
26 |
+
### Reference:
|
27 |
+
* Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
|
app.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
#for textfiles
|
5 |
+
from langchain.document_loaders import TextLoader
|
6 |
+
#text splitter
|
7 |
+
from langchain.text_splitter import CharacterTextSplitter
|
8 |
+
#for using HuggingFace models & embeddings
|
9 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
10 |
+
from langchain import HuggingFaceHub
|
11 |
+
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
|
12 |
+
from langchain.vectorstores import FAISS
|
13 |
+
#question-answering chain (FAISS, imported above, is Facebook's vector similarity search library)
|
14 |
+
from langchain.chains.question_answering import load_qa_chain
|
15 |
+
#load pdf
|
16 |
+
from langchain.document_loaders import UnstructuredPDFLoader
|
17 |
+
|
18 |
+
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]
|
19 |
+
|
20 |
+
st.title('Document Q&A - Ask anything in your Document')
|
21 |
+
st.sidebar.subheader('Upload document')
|
22 |
+
uploaded_file = st.file_uploader("Upload File",type=['txt','pdf'])
|
23 |
+
# url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
|
24 |
+
# res = requests.get(url2)
|
25 |
+
# with open("KS-all-info_rev1.txt", "w") as f:
|
26 |
+
# f.write(res.text)
|
27 |
+
|
28 |
+
st.subheader('Enter query')
|
29 |
+
query = st.text_input('Ask anything about the Document you uploaded')
|
30 |
+
|
31 |
+
st.subheader('Answer')
|
32 |
+
st.write('Answer from document')
|
33 |
+
|
34 |
+
# # Document Loader
|
35 |
+
# loader = TextLoader('./KS-all-info_rev1.txt')
|
36 |
+
# documents = loader.load()
|
37 |
+
# import textwrap
|
38 |
+
# def wrap_text_preserve_newlines(text, width=110):
|
39 |
+
# # Split the input text into lines based on newline characters
|
40 |
+
# lines = text.split('\n')
|
41 |
+
# # Wrap each line individually
|
42 |
+
# wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
|
43 |
+
# # Join the wrapped lines back together using newline characters
|
44 |
+
# wrapped_text = '\n'.join(wrapped_lines)
|
45 |
+
# return wrapped_text
|
46 |
+
|
47 |
+
# # Text Splitter
|
48 |
+
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
|
49 |
+
# docs = text_splitter.split_documents(documents)
|
50 |
+
|
51 |
+
# # Embeddings
|
52 |
+
# embeddings = HuggingFaceEmbeddings()
|
53 |
+
|
54 |
+
# #Create the vectorized db
|
55 |
+
# db = FAISS.from_documents(docs, embeddings)
|
56 |
+
|
57 |
+
# llm=HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":0, "max_length":512})
|
58 |
+
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
|
59 |
+
# chain = load_qa_chain(llm2, chain_type="stuff")
|
60 |
+
|
61 |
+
# # Sample question
|
62 |
+
# # query = "What the actual issues and drawbacks ?"
|
63 |
+
|
64 |
+
# # docs = db.similarity_search(query)
|
65 |
+
# # chain.run(input_documents=docs, question=query)
|
66 |
+
|
67 |
+
|
68 |
+
# # PDFs
|
69 |
+
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
|
70 |
+
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
|
71 |
+
# # !mkdir pdfs
|
72 |
+
# # !cp *pdf '/content/pdfs'
|
73 |
+
|
74 |
+
# # pdf_folder_path = '/content/pdfs'
|
75 |
+
# # os.listdir(pdf_folder_path)
|
76 |
+
|
77 |
+
# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
|
78 |
+
# # loaders
|
79 |
+
|
80 |
+
# index = VectorstoreIndexCreator(
|
81 |
+
# embedding=HuggingFaceEmbeddings(),
|
82 |
+
# text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
|
83 |
+
|
84 |
+
# #Load llm with selected one
|
85 |
+
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
|
86 |
+
# #Prepare the pipeline
|
87 |
+
# from langchain.chains import RetrievalQA
|
88 |
+
# chain = RetrievalQA.from_chain_type(llm=llm2,
|
89 |
+
# chain_type="stuff",
|
90 |
+
# retriever=index.vectorstore.as_retriever(),
|
91 |
+
# input_key="question")
|
92 |
+
# #get reply to our questions
|
93 |
+
# # chain.run('What is the difference between a PLC and a PC?')
|
requirements.txt
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp==3.8.4
|
2 |
+
aiosignal==1.3.1
|
3 |
+
altair==5.0.1
|
4 |
+
antlr4-python3-runtime==4.9.3
|
5 |
+
anyio==3.7.0
|
6 |
+
argilla==1.9.0
|
7 |
+
async-timeout==4.0.2
|
8 |
+
attrs==23.1.0
|
9 |
+
backoff==2.2.1
|
10 |
+
blinker==1.6.2
|
11 |
+
cachetools==5.3.1
|
12 |
+
certifi==2023.5.7
|
13 |
+
cffi==1.15.1
|
14 |
+
chardet==5.1.0
|
15 |
+
charset-normalizer==3.1.0
|
16 |
+
chromadb==0.3.26
|
17 |
+
click==8.1.3
|
18 |
+
clickhouse-connect==0.6.2
|
19 |
+
coloredlogs==15.0.1
|
20 |
+
commonmark==0.9.1
|
21 |
+
contourpy==1.0.7
|
22 |
+
cryptography==41.0.1
|
23 |
+
cycler==0.11.0
|
24 |
+
Cython==0.29.35
|
25 |
+
dataclasses-json==0.5.8
|
26 |
+
decorator==5.1.1
|
27 |
+
Deprecated==1.2.14
|
28 |
+
duckdb==0.8.1
|
29 |
+
effdet==0.4.1
|
30 |
+
et-xmlfile==1.1.0
|
31 |
+
exceptiongroup==1.1.1
|
32 |
+
faiss-cpu==1.7.4
|
33 |
+
fastapi==0.97.0
|
34 |
+
filelock==3.12.2
|
35 |
+
filetype==1.2.0
|
36 |
+
flatbuffers==23.5.26
|
37 |
+
fonttools==4.40.0
|
38 |
+
frozenlist==1.3.3
|
39 |
+
fsspec==2023.6.0
|
40 |
+
gitdb==4.0.10
|
41 |
+
GitPython==3.1.31
|
42 |
+
greenlet==2.0.2
|
43 |
+
h11==0.14.0
|
44 |
+
hnswlib==0.7.0
|
45 |
+
httpcore==0.16.3
|
46 |
+
httptools==0.5.0
|
47 |
+
httpx==0.23.3
|
48 |
+
huggingface-hub==0.15.1
|
49 |
+
humanfriendly==10.0
|
50 |
+
idna==3.4
|
51 |
+
importlib-metadata==6.6.0
|
52 |
+
iopath==0.1.10
|
53 |
+
Jinja2==3.1.2
|
54 |
+
joblib==1.2.0
|
55 |
+
jsonschema==4.17.3
|
56 |
+
kiwisolver==1.4.4
|
57 |
+
langchain==0.0.198
|
58 |
+
langchainplus-sdk==0.0.9
|
59 |
+
layoutparser==0.3.4
|
60 |
+
lxml==4.9.2
|
61 |
+
lz4==4.3.2
|
62 |
+
Markdown==3.4.3
|
63 |
+
MarkupSafe==2.1.3
|
64 |
+
marshmallow==3.19.0
|
65 |
+
marshmallow-enum==1.5.1
|
66 |
+
matplotlib==3.7.1
|
67 |
+
monotonic==1.6
|
68 |
+
mpmath==1.3.0
|
69 |
+
msg-parser==1.2.0
|
70 |
+
multidict==6.0.4
|
71 |
+
mypy-extensions==1.0.0
|
72 |
+
networkx==3.1
|
73 |
+
nltk==3.8.1
|
74 |
+
numexpr==2.8.4
|
75 |
+
numpy==1.23.5
|
76 |
+
olefile==0.46
|
77 |
+
omegaconf==2.3.0
|
78 |
+
onnxruntime==1.15.0
|
79 |
+
openapi-schema-pydantic==1.2.4
|
80 |
+
opencv-python==4.7.0.72
|
81 |
+
openpyxl==3.1.2
|
82 |
+
overrides==7.3.1
|
83 |
+
packaging==23.1
|
84 |
+
pandas==1.5.3
|
85 |
+
pdf2image==1.16.3
|
86 |
+
pdfminer.six==20221105
|
87 |
+
pdfplumber==0.9.0
|
88 |
+
Pillow==9.5.0
|
89 |
+
portalocker==2.7.0
|
90 |
+
posthog==3.0.1
|
91 |
+
protobuf==4.23.2
|
92 |
+
pulsar-client==3.2.0
|
93 |
+
pyarrow==12.0.1
|
94 |
+
pycocotools==2.0.6
|
95 |
+
pycparser==2.21
|
96 |
+
pydantic==1.10.9
|
97 |
+
pydeck==0.8.1b0
|
98 |
+
Pygments==2.15.1
|
99 |
+
Pympler==1.0.1
|
100 |
+
pypandoc==1.11
|
101 |
+
pyparsing==3.0.9
|
102 |
+
pyrsistent==0.19.3
|
103 |
+
pytesseract==0.3.10
|
104 |
+
python-dateutil==2.8.2
|
105 |
+
python-docx==0.8.11
|
106 |
+
python-dotenv==1.0.0
|
107 |
+
python-magic==0.4.27
|
108 |
+
python-multipart==0.0.6
|
109 |
+
python-pptx==0.6.21
|
110 |
+
pytz==2023.3
|
111 |
+
pytz-deprecation-shim==0.1.0.post0
|
112 |
+
PyYAML==6.0
|
113 |
+
regex==2023.6.3
|
114 |
+
requests==2.31.0
|
115 |
+
rfc3986==1.5.0
|
116 |
+
rich==13.0.1
|
117 |
+
safetensors==0.3.1
|
118 |
+
scikit-learn==1.2.2
|
119 |
+
scipy==1.10.1
|
120 |
+
sentence-transformers==2.2.2
|
121 |
+
sentencepiece==0.1.99
|
122 |
+
six==1.16.0
|
123 |
+
smmap==5.0.0
|
124 |
+
sniffio==1.3.0
|
125 |
+
SQLAlchemy==2.0.16
|
126 |
+
starlette==0.27.0
|
127 |
+
streamlit==1.23.1
|
128 |
+
sympy==1.12
|
129 |
+
tabulate==0.9.0
|
130 |
+
tenacity==8.2.2
|
131 |
+
threadpoolctl==3.1.0
|
132 |
+
tiktoken==0.4.0
|
133 |
+
timm==0.9.2
|
134 |
+
tokenizers==0.13.3
|
135 |
+
toml==0.10.2
|
136 |
+
toolz==0.12.0
|
137 |
+
torch==2.0.1
|
138 |
+
torchvision==0.15.2
|
139 |
+
tornado==6.3.2
|
140 |
+
tqdm==4.65.0
|
141 |
+
transformers==4.30.1
|
142 |
+
typer==0.9.0
|
143 |
+
typing-inspect==0.9.0
|
144 |
+
typing_extensions==4.6.3
|
145 |
+
tzdata==2023.3
|
146 |
+
tzlocal==4.3
|
147 |
+
unstructured==0.7.4
|
148 |
+
unstructured-inference==0.5.1
|
149 |
+
urllib3==2.0.3
|
150 |
+
uvicorn==0.22.0
|
151 |
+
uvloop==0.17.0
|
152 |
+
validators==0.20.0
|
153 |
+
Wand==0.6.11
|
154 |
+
watchfiles==0.19.0
|
155 |
+
websockets==11.0.3
|
156 |
+
wrapt==1.14.1
|
157 |
+
xlrd==2.0.1
|
158 |
+
XlsxWriter==3.1.2
|
159 |
+
yarl==1.9.2
|
160 |
+
zipp==3.15.0
|
161 |
+
zstandard==0.21.0
|