jarif committed on
Commit
b59c943
1 Parent(s): 4f64ed1

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +59 -0
  3. docs/Ali_Md_Monsur_Masterthesis.pdf +3 -0
  4. ingest.py +30 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/Ali_Md_Monsur_Masterthesis.pdf filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain.llms import HuggingFacePipeline
6
+ from langchain.chains import RetrievalQA
7
+
8
# Identifier passed to both `from_pretrained` calls in load_llm().
# NOTE(review): presumably a local directory or a model-hub name - confirm
# which one the deployment expects.
checkpoint = "LaMini-T5-738M"


@st.cache_resource
def load_llm():
    """Build the text2text-generation pipeline and wrap it for LangChain.

    The tokenizer and the seq2seq model are both loaded from ``checkpoint``.
    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the returned object instead of rebuilding it.

    Returns:
        A ``HuggingFacePipeline`` wrapping the configured transformers
        pipeline.
    """
    tok = AutoTokenizer.from_pretrained(checkpoint)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    # Sampling settings forwarded verbatim to the transformers pipeline.
    generation_settings = {
        "max_length": 256,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.95,
    }
    generator = pipeline(
        'text2text-generation',
        model=seq2seq,
        tokenizer=tok,
        **generation_settings,
    )
    return HuggingFacePipeline(pipeline=generator)
24
+
25
@st.cache_resource
def qa_llm():
    """Assemble the retrieval-QA chain used to answer questions.

    Combines the LLM from ``load_llm()`` with a retriever built on the FAISS
    index stored in the ``faiss_index`` directory. The chain is created with
    ``return_source_documents=True``.

    Returns:
        The ``RetrievalQA`` chain.
    """
    language_model = load_llm()
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # NOTE(review): newer langchain_community releases may require
    # `allow_dangerous_deserialization=True` here - confirm against the
    # installed version before changing the call.
    index = FAISS.load_local("faiss_index", embedder)
    chain = RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=index.as_retriever(),
        return_source_documents=True,
    )
    return chain
34
+
35
def process_answer(instruction):
    """Run the retrieval-QA chain on ``instruction``.

    Args:
        instruction: The question text handed to the chain.

    Returns:
        A 2-tuple ``(answer, response)``: the value stored under the
        ``'result'`` key of the chain's response, followed by the full
        response object.
    """
    chain = qa_llm()
    response = chain(instruction)
    return response['result'], response
40
+
41
def main():
    """Render the Streamlit page and answer the submitted question.

    Shows the title and an "About the App" expander, reads a question from a
    text area and, when the "Ask" button is pressed, displays the answer
    followed by the full chain response returned by ``process_answer``.
    A blank question is rejected with a warning instead of being sent
    through retrieval and generation.
    """
    st.title("Search Your PDF 🐦📄")
    with st.expander("About the App"):
        st.markdown(
            """
            This is a Generative AI powered Question and Answering app that responds to questions about your PDF File.
            """
        )
    question = st.text_area("Enter your Question")
    if st.button("Ask"):
        # Guard: an empty or whitespace-only query would still run the
        # retriever and the model, for no useful result.
        if not question.strip():
            st.warning("Please enter a question first.")
            return

        st.info("Your Question: " + question)

        st.info("Your Answer")
        answer, metadata = process_answer(question)
        st.write(answer)
        st.write(metadata)


if __name__ == '__main__':
    main()
docs/Ali_Md_Monsur_Masterthesis.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45d4815c2923a3b8a116976dcf225ae21a670d83ac7d9895bfe418ca6343160d
3
+ size 3059014
ingest.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
5
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+
8
def main(docs_dir="docs", index_dir="faiss_index", chunk_size=500, chunk_overlap=50):
    """Index every PDF under ``docs_dir`` into a FAISS store saved at ``index_dir``.

    The directory tree is walked recursively. Each PDF is loaded with
    ``PDFMinerLoader``, the loaded documents are split into chunks, embedded
    with the ``all-MiniLM-L6-v2`` model and written to disk as a FAISS index.

    Args:
        docs_dir: Root directory searched for PDF files.
        index_dir: Directory the FAISS index is saved to.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Overlap between neighbouring chunks; should stay well
            below ``chunk_size``.

    Raises:
        SystemExit: If no PDF content could be loaded from ``docs_dir``.
    """
    documents = []
    for root, _dirs, files in os.walk(docs_dir):
        for file in files:
            # Case-insensitive so that "REPORT.PDF" is picked up as well.
            if file.lower().endswith(".pdf"):
                print(file)
                loader = PDFMinerLoader(os.path.join(root, file))
                # Accumulate across files: plain assignment here kept only
                # the documents of the last PDF visited.
                documents.extend(loader.load())

    # Fail early with a clear message instead of an obscure error raised
    # while building an index from an empty document list.
    if not documents:
        raise SystemExit(
            f"No PDF documents found under {docs_dir!r}; nothing to index."
        )

    # An overlap equal to the chunk size (the previous 500/500 setting)
    # makes neighbouring chunks almost entirely redundant.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts = text_splitter.split_documents(documents)

    # Create embeddings
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create FAISS index
    db = FAISS.from_documents(texts, embeddings)

    # Save the index
    db.save_local(index_dir)


if __name__ == "__main__":
    main()