PCFISH committed on
Commit
fb78073
•
1 Parent(s): ab69028

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -5
app.py CHANGED
@@ -57,13 +57,33 @@ def get_json_file(docs):
57
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
58
  def get_text_chunks(documents):
59
  text_splitter = RecursiveCharacterTextSplitter(
60
- chunk_size=1000, # 청크의 크기λ₯Ό μ§€μ •ν•©λ‹ˆλ‹€.
61
- chunk_overlap=200, # 청크 μ‚¬μ΄μ˜ 쀑볡을 μ§€μ •ν•©λ‹ˆλ‹€.
62
- length_function=len # ν…μŠ€νŠΈμ˜ 길이λ₯Ό μΈ‘μ •ν•˜λŠ” ν•¨μˆ˜λ₯Ό μ§€μ •ν•©λ‹ˆλ‹€.
63
  )
64
 
65
- documents = text_splitter.split_documents(documents) # λ¬Έμ„œλ“€μ„ 청크둜 λ‚˜λˆ•λ‹ˆλ‹€
66
- return documents # λ‚˜λˆˆ 청크λ₯Ό λ°˜ν™˜ν•©λ‹ˆλ‹€.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  # 텍스트 청크들로부터 벡터 스토어를 생성하는 함수입니다.
 
57
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
58
  def get_text_chunks(documents):
59
  text_splitter = RecursiveCharacterTextSplitter(
60
+ chunk_size=1000,
61
+ chunk_overlap=200,
62
+ length_function=len
63
  )
64
 
65
+ text_chunks = []
66
+
67
+ for doc in documents:
68
+ if isinstance(doc, str):
69
+ # If the document is a string, treat it as plain text
70
+ text_chunks.append(doc)
71
+ elif hasattr(doc, 'page_content'):
72
+ # If the document has a 'page_content' attribute, use it
73
+ text_chunks.append(doc.page_content)
74
+ else:
75
+ # Handle other types of documents as needed
76
+ # For example, if it's a list of strings, concatenate them
77
+ if isinstance(doc, list) and all(isinstance(item, str) for item in doc):
78
+ text_chunks.append(' '.join(doc))
79
+ else:
80
+ # Handle other cases based on the actual structure of your documents
81
+ raise ValueError(f"Unsupported document type: {type(doc)}")
82
+
83
+ # Split the text chunks
84
+ text_chunks = text_splitter.split_documents(text_chunks)
85
+
86
+ return text_chunks
87
 
88
 
89
  # 텍스트 청크들로부터 벡터 스토어를 생성하는 함수입니다.