SumayyaAli committed on
Commit
63337f5
1 Parent(s): 1cfa27e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.document_loaders import PyPDFLoader
4
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
5
+ from transformers import pipeline
6
+ import torch
7
+ import base64
8
+
9
+
10
checkpoint = "LaMini-Flan-T5-248M"


# model and tokenizer loading
@st.cache_resource(show_spinner=False)
def _load_model_and_tokenizer(model_path):
    """Load the T5 tokenizer and model from *model_path* once per process.

    Streamlit re-executes this script on every user interaction; caching the
    loaded objects as a resource avoids re-reading the weights on each rerun.
    ``show_spinner=False`` keeps the call from rendering anything, because it
    runs before ``st.set_page_config``.

    Args:
        model_path: Local directory or hub id passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(model_path)
    loaded_model = T5ForConditionalGeneration.from_pretrained(
        model_path,
        device_map='auto',
        torch_dtype=torch.float32,
        # NOTE(review): hard-coded, Windows-style offload location; confirm it
        # is appropriate for the deployment target.
        offload_folder='D:/project/offload',
    )
    return loaded_tokenizer, loaded_model


tokenizer, base_model = _load_model_and_tokenizer(checkpoint)
15
+
16
+ #file loader and preprocessing
17
def file_preprocessing(file):
    """Extract the text of a PDF as one concatenated string.

    The PDF is loaded with ``PyPDFLoader``, split into pages, and the pages
    are re-split with a ``RecursiveCharacterTextSplitter`` (chunk_size=200,
    chunk_overlap=50). The ``page_content`` of every resulting chunk is
    concatenated with no separator.

    Args:
        file: Path of the PDF file to read.

    Returns:
        str: The concatenated chunk text; empty if no chunks were produced.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    texts = text_splitter.split_documents(pages)
    # NOTE(review): with chunk_overlap=50, neighbouring chunks presumably share
    # text, so the joined string may repeat passages; confirm this is intended.
    # Join once instead of repeated ``+=``; the per-chunk debug print that
    # dumped the whole document to stdout has been dropped.
    return "".join(text.page_content for text in texts)
27
+
28
+ #LLM pipeline
29
def llm_pipeline(filepath):
    """Summarize the PDF at *filepath* with the module-level model.

    Builds a Hugging Face ``summarization`` pipeline around ``base_model`` and
    ``tokenizer`` (max_length=500, min_length=50), feeds it the text returned
    by ``file_preprocessing`` and returns the first result's summary.

    Args:
        filepath: Path of the PDF file to summarize.

    Returns:
        The ``'summary_text'`` value of the first pipeline output.
    """
    summarizer_options = {
        'model': base_model,
        'tokenizer': tokenizer,
        'max_length': 500,
        'min_length': 50,
    }
    summarizer = pipeline('summarization', **summarizer_options)

    # NOTE(review): the whole document is passed in a single call with no
    # truncation or chunking; confirm behaviour on long PDFs.
    document_text = file_preprocessing(filepath)
    outputs = summarizer(document_text)
    return outputs[0]['summary_text']
40
+
41
# function to display the PDF of a given file
def displayPDF(file):
    """Render the PDF stored at *file* inline as a base64 data-URI iframe.

    Args:
        file: Path of the PDF file on disk.

    Returns:
        None. The viewer is emitted with ``st.markdown``.

    Raises:
        OSError: If *file* cannot be opened for reading.
    """
    # Intentionally not decorated with ``st.cache_data``: the cache would be
    # keyed on the path string alone, so uploading a different PDF under the
    # same filename would replay the previously rendered (stale) viewer.
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')

    # Embedding PDF in HTML
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'

    # Displaying File
    st.markdown(pdf_display, unsafe_allow_html=True)
53
+
54
+ #streamlit code
55
+ st.set_page_config(layout="wide")
56
+
57
def main():
    """Run the Streamlit UI: upload a PDF, preview it and show its summary.

    After the user uploads a PDF and presses "Summarize", the upload is saved
    under ``data/``, displayed in the left column via ``displayPDF`` and
    summarized in the right column via ``llm_pipeline``.
    """
    import os  # local import so this block does not depend on file-level edits

    st.title("Document Summarization App using Language Model")

    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])

    # The button is only rendered once a file is present (short-circuit ``or``).
    if uploaded_file is None or not st.button("Summarize"):
        return

    col1, col2 = st.columns(2)

    # Make sure the target directory exists, and keep only the final path
    # component of the client-supplied name so it cannot escape ``data/``.
    upload_dir = "data"
    os.makedirs(upload_dir, exist_ok=True)
    safe_name = os.path.basename(uploaded_file.name)
    filepath = upload_dir + "/" + safe_name

    with open(filepath, "wb") as temp_file:
        # getvalue() returns the full upload regardless of the stream position.
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        displayPDF(filepath)

    with col2:
        summary = llm_pipeline(filepath)
        st.info("Summarization Complete")
        st.success(summary)


if __name__ == "__main__":
    main()