feat: Split wikidocs into chunks and add to combined documents
app.py
CHANGED
@@ -86,6 +86,13 @@ for dirpath, _, filenames in os.walk(repo_root_dir):
 print(f"Number of .ipynb files: {len(ipynb_documents)}")
 
 
+## wikidocs
+import pandas as pd
+
+df = pd.read_parquet("wikidocs_14314.parquet")
+wiki_documents = text_splitter.split_documents(loader.load())
+
+
 # Split documents into chunks
 def split_documents(documents, language, chunk_size=2000, chunk_overlap=200):
     splitter = RecursiveCharacterTextSplitter.from_language(
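Note on the hunk above: as committed, the new wikidocs block never uses df, and neither loader nor text_splitter is defined at this point in app.py, so the line building wiki_documents would raise a NameError at startup. A minimal sketch of one way to wire it up, assuming the parquet stores the article text in a column named "text" (the real schema of wikidocs_14314.parquet is not visible in this diff):

import pandas as pd
from langchain_community.document_loaders import DataFrameLoader

df = pd.read_parquet("wikidocs_14314.parquet")

# Wrap each row in a Document; "text" is an assumed column name,
# and the remaining columns become Document metadata.
loader = DataFrameLoader(df, page_content_column="text")
wiki_documents = loader.load()

Chunking can then be left entirely to the split_documents helper, which the next hunk already applies to wiki_documents; running a separate text_splitter pass here as well would split the documents twice.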
@@ -97,12 +104,14 @@ def split_documents(documents, language, chunk_size=2000, chunk_overlap=200):
 py_docs = split_documents(py_documents, Language.PYTHON)
 mdx_docs = split_documents(mdx_documents, Language.MARKDOWN)
 ipynb_docs = split_documents(ipynb_documents, Language.PYTHON)
+wiki_docs = split_documents(wiki_documents, Language.MARKDOWN)
 
-print(f"Number of split .py documents: {len(py_docs)}")
-print(f"Number of split .mdx documents: {len(mdx_docs)}")
-print(f"Number of split .ipynb documents: {len(ipynb_docs)}")
+print(f"Number of split .py documents: {len(py_docs)}")
+print(f"Number of split .mdx documents: {len(mdx_docs)}")
+print(f"Number of split .ipynb documents: {len(ipynb_docs)}")
+print(f"Number of split wiki documents: {len(wiki_docs)}")
 
-combined_documents = py_docs + mdx_docs + ipynb_docs
+combined_documents = py_docs + mdx_docs + ipynb_docs + wiki_docs
 print(f"Total number of documents: {len(combined_documents)}")
 
 
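For reference, the split_documents helper that all four document sets pass through is only partially visible in the hunk context. Reconstructed from the visible lines, it plausibly reads as follows; the import path is an assumption, since it varies across LangChain versions:

from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

def split_documents(documents, language, chunk_size=2000, chunk_overlap=200):
    # Language-aware recursive splitting: the splitter picks separators
    # suited to the language (class/def boundaries for Python, headings
    # for Markdown) before falling back to plain character splits.
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

Using Language.MARKDOWN for the wiki documents matches how the .mdx files are treated, and the 200-character overlap keeps retrieval context from being cut off at chunk boundaries.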
@@ -132,19 +141,21 @@ cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
 
 # Create and save FAISS index
 FAISS_DB_INDEX = "./langchain_faiss"
-faiss_db = FAISS.from_documents(
-    documents=combined_documents,
-    embedding=cached_embeddings,
-)
-faiss_db.save_local(folder_path=FAISS_DB_INDEX)
+if not os.path.exists(FAISS_DB_INDEX):
+    faiss_db = FAISS.from_documents(
+        documents=combined_documents,
+        embedding=cached_embeddings,
+    )
+    faiss_db.save_local(folder_path=FAISS_DB_INDEX)
 
 # Create and save Chroma index
 CHROMA_DB_INDEX = "./langchain_chroma"
-chroma_db = Chroma.from_documents(
-    documents=combined_documents,
-    embedding=cached_embeddings,
-    persist_directory=CHROMA_DB_INDEX,
-)
+if not os.path.exists(CHROMA_DB_INDEX):
+    chroma_db = Chroma.from_documents(
+        documents=combined_documents,
+        embedding=cached_embeddings,
+        persist_directory=CHROMA_DB_INDEX,
+    )
 
 # load vectorstore
 faiss_db = FAISS.load_local(
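The new if not os.path.exists(...) guards make the expensive embedding-and-indexing step run only on the Space's first boot; later restarts reuse the saved FAISS files and the persisted Chroma directory instead of re-embedding every chunk. A minimal sketch of the reload side, assuming the cached_embeddings object defined earlier in app.py; note that recent langchain_community releases require an explicit opt-in to pickle deserialization when loading a local FAISS index:

from langchain_community.vectorstores import FAISS

# Reload the index saved by the guarded block on first boot.
faiss_db = FAISS.load_local(
    FAISS_DB_INDEX,
    cached_embeddings,
    # Assumption: a library version that requires this flag; older
    # releases load without it.
    allow_dangerous_deserialization=True,
)

One caveat with this guard: if the process dies mid-build, a partially written index directory can remain on disk, so the next boot would skip rebuilding; deleting the directory before retrying is the simple fix.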