PCFISH committed on
Commit f3a69bb • 1 Parent(s): fb78073

Update app.py

Files changed (1): app.py +6 -27
app.py CHANGED
@@ -11,7 +11,6 @@ from langchain.chains import ConversationalRetrievalChain
 from htmlTemplates import css, bot_template, user_template
 from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers  # For loading transformer models.
 from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
-from io import TextIOWrapper
 import tempfile  # Library for creating temporary files.
 import os
 
@@ -57,33 +56,13 @@ def get_json_file(docs):
 # Function that processes documents and splits them into text chunks.
 def get_text_chunks(documents):
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
+        chunk_size=1000,  # Specifies the chunk size.
+        chunk_overlap=200,  # Specifies the overlap between chunks.
+        length_function=len  # Specifies the function used to measure text length.
     )
 
-    text_chunks = []
-
-    for doc in documents:
-        if isinstance(doc, str):
-            # If the document is a string, treat it as plain text
-            text_chunks.append(doc)
-        elif hasattr(doc, 'page_content'):
-            # If the document has a 'page_content' attribute, use it
-            text_chunks.append(doc.page_content)
-        else:
-            # Handle other types of documents as needed
-            # For example, if it's a list of strings, concatenate them
-            if isinstance(doc, list) and all(isinstance(item, str) for item in doc):
-                text_chunks.append(' '.join(doc))
-            else:
-                # Handle other cases based on the actual structure of your documents
-                raise ValueError(f"Unsupported document type: {type(doc)}")
-
-    # Split the text chunks
-    text_chunks = text_splitter.split_documents(text_chunks)
-
-    return text_chunks
+    documents = text_splitter.split_documents(documents)  # Splits the documents into chunks.
+    return documents  # Returns the split chunks.
 
 
 # Function that creates a vector store from the text chunks.
@@ -183,4 +162,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
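
For reference, the simplified get_text_chunks delegates splitting entirely to RecursiveCharacterTextSplitter.split_documents, which operates on LangChain Document objects (it reads each item's page_content attribute) rather than on raw strings as the removed branching logic assumed. Below is a minimal usage sketch under that assumption; the standalone-script framing and the "sample.pdf" path are illustrative, not part of the commit:

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load a PDF into a list of LangChain Document objects (one per page).
# "sample.pdf" is a hypothetical path for illustration.
documents = PyPDFLoader("sample.pdf").load()

# Same splitter configuration as the commit: 1000-character chunks
# with 200 characters of overlap, measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# split_documents returns a new list of smaller Document objects,
# preserving each source document's metadata on every chunk.
chunks = text_splitter.split_documents(documents)
print(f"{len(documents)} pages -> {len(chunks)} chunks")

This also explains the bug the commit removes: passing plain strings to split_documents, as the old code effectively did, fails when page_content is accessed; split_text is the string-oriented counterpart for that case.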