PCFISH committed on
Commit
ab69028
•
1 Parent(s): 2945b13

Update app.py

Files changed (1)
  1. app.py +41 -60
app.py CHANGED
@@ -11,6 +11,7 @@ from langchain.chains import ConversationalRetrievalChain
 from htmlTemplates import css, bot_template, user_template
 from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers  # For loading transformer models.
 from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
+from io import TextIOWrapper
 import tempfile  # Library for creating temporary files.
 import os
 
@@ -30,50 +31,39 @@ def get_pdf_text(pdf_docs):
 
 def get_text_file(docs):
     text_list = []
-    for file in docs:
-        if file.type == 'text/plain':
-            # file is .txt
-            text_list.append(file.getvalue().decode('utf-8'))
+    with TextIOWrapper(docs, encoding='utf-8') as f:
+        text_list.append(f.read())
     return text_list
 
 def get_csv_file(docs):
+    # For .csv files
     csv_list = []
-    for file in docs:
-        if file.type == 'text/csv':
-            # file is .csv
-            csv_list.extend(csv.reader(file.getvalue().decode('utf-8').splitlines()))
+    csv_data = docs.getvalue().decode('utf-8')
+    for row in csv_data.split('\n')[1:]:
+        columns = row.split(',')
+        text = columns[1]
+        csv_list.append(text)
     return csv_list
 
 def get_json_file(docs):
     json_list = []
-    for file in docs:
-        if file.type == 'application/json':
-            # file is .json
-            json_list.extend(json.load(file))
+    json_data = docs.getvalue().decode('utf-8')
+    for obj in json.loads(json_data):
+        text = obj.get('text', '')
+        json_list.append(text)
    return json_list
 
 
 # Function that processes documents and splits them into text chunks.
 def get_text_chunks(documents):
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
+        chunk_size=1000,     # Specifies the chunk size.
+        chunk_overlap=200,   # Specifies the overlap between chunks.
+        length_function=len  # Function used to measure text length.
     )
 
-    # Append each document's content to a list.
-    texts = []
-    for doc in documents:
-        if hasattr(doc, 'page_content'):
-            # Only append Document objects.
-            texts.append(doc.page_content)
-        elif isinstance(doc, str):
-            # Append strings as-is.
-            texts.append(doc)
-
-    # Return the resulting chunks.
-    return text_splitter.split_documents(texts)
-
+    documents = text_splitter.split_documents(documents)  # Split the documents into chunks.
+    return documents  # Return the split chunks.
 
 
 # Function that creates a vector store from the text chunks.
@@ -87,30 +77,19 @@ def get_vectorstore(text_chunks):
 
 
 def get_conversation_chain(vectorstore):
-    print(f"DEBUG: session_state.conversation before initialization: {st.session_state.conversation}")
-
-    try:
-        if st.session_state.conversation is None:
-            gpt_model_name = 'gpt-3.5-turbo'
-            llm = ChatOpenAI(model_name=gpt_model_name)
-
-            # Create memory to store the conversation history.
-            memory = ConversationBufferMemory(
-                memory_key='chat_history', return_messages=True)
-            # Create the conversational retrieval chain.
-            conversation_chain = ConversationalRetrievalChain.from_llm(
-                llm=llm,
-                retriever=vectorstore.as_retriever(),
-                memory=memory
-            )
-            st.session_state.conversation = conversation_chain
-
-    except Exception as e:
-        print(f"Error during conversation initialization: {e}")
-
-    print(f"DEBUG: session_state.conversation after initialization: {st.session_state.conversation}")
-
-    return st.session_state.conversation if st.session_state.conversation else ConversationalRetrievalChain()
+    gpt_model_name = 'gpt-3.5-turbo'
+    llm = ChatOpenAI(model_name=gpt_model_name)  # Load the gpt-3.5 model.
+
+    # Create memory to store the conversation history.
+    memory = ConversationBufferMemory(
+        memory_key='chat_history', return_messages=True)
+    # Create the conversational retrieval chain.
+    conversation_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=vectorstore.as_retriever(),
+        memory=memory
+    )
+    return conversation_chain
 
 # Function that handles user input.
 def handle_userinput(user_question):
  def handle_userinput(user_question):
@@ -130,12 +109,13 @@ def handle_userinput(user_question):
 
 
 def main():
     load_dotenv()
-    st.set_page_config(page_title="Chat with multiple Files :)",
+    st.set_page_config(page_title="Chat with multiple Files",
                        page_icon=":books:")
     st.write(css, unsafe_allow_html=True)
 
-    if "conversation" not in st.session_state or st.session_state.conversation is None:
+    if "conversation" not in st.session_state:
         st.session_state.conversation = None
+    if "chat_history" not in st.session_state:
         st.session_state.chat_history = None
 
     st.header("Chat with multiple Files :")
@@ -150,7 +130,7 @@ def main():
 
         st.subheader("Your documents")
         docs = st.file_uploader(
-            "Upload your documents here and click on 'Process'", accept_multiple_files=True)
+            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
         if st.button("Process"):
             with st.spinner("Processing"):
                 # get pdf text
@@ -160,16 +140,16 @@ def main():
                     print('file - type : ', file.type)
                     if file.type == 'text/plain':
                         # file is .txt
-                        doc_list.extend(get_text_file([file]))
+                        doc_list.extend(get_text_file(file))
                     elif file.type in ['application/octet-stream', 'application/pdf']:
                         # file is .pdf
                         doc_list.extend(get_pdf_text(file))
                     elif file.type == 'text/csv':
                         # file is .csv
-                        doc_list.extend(get_csv_file([file]))
+                        doc_list.extend(get_csv_file(file))
                     elif file.type == 'application/json':
                         # file is .json
-                        doc_list.extend(get_json_file([file]))
+                        doc_list.extend(get_json_file(file))
 
                 # get the text chunks
                 text_chunks = get_text_chunks(doc_list)
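Browsers report MIME types inconsistently (the branch above already has to accept 'application/octet-stream' for PDFs), so falling back on the file name extension can make this dispatch more robust. A sketch under that assumption, reusing the loader functions defined above:

    import os

    def load_by_extension(file):
        # Map file extensions to the loader functions defined above.
        handlers = {'.txt': get_text_file, '.pdf': get_pdf_text,
                    '.csv': get_csv_file, '.json': get_json_file}
        ext = os.path.splitext(file.name)[1].lower()
        handler = handlers.get(ext)
        return handler(file) if handler else []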
@@ -178,8 +158,9 @@ def main():
                 vectorstore = get_vectorstore(text_chunks)
 
                 # create conversation chain
-                st.session_state.conversation = get_conversation_chain(vectorstore)
+                st.session_state.conversation = get_conversation_chain(
+                    vectorstore)
 
 
 if __name__ == '__main__':
-    main()
+    main()
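One caveat in the committed get_text_chunks: split_documents expects LangChain Document objects, while get_text_file, get_csv_file, and get_json_file all return plain strings. A sketch that tolerates both by wrapping strings as Documents first (the function name is illustrative):

    from langchain.docstore.document import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    def get_text_chunks_sketch(items):
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, length_function=len)
        # Wrap plain strings as Document objects; pass Documents through.
        docs = [d if isinstance(d, Document) else Document(page_content=d)
                for d in items]
        return splitter.split_documents(docs)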