anpigon committed on
Commit 758156d • 1 Parent(s): ea24c65

feat: Split wikidocs into chunks and add to combined documents

Files changed (1):
  1. app.py +25 -14
app.py CHANGED
@@ -86,6 +86,13 @@ for dirpath, _, filenames in os.walk(repo_root_dir):
 print(f".ipynb 파일의 개수: {len(ipynb_documents)}")


+## wikidocs
+import pandas as pd
+
+df = pd.read_parquet("wikidocs_14314.parquet")
+wiki_documents = text_splitter.split_documents(loader.load())
+
+
 # Split documents into chunks
 def split_documents(documents, language, chunk_size=2000, chunk_overlap=200):
     splitter = RecursiveCharacterTextSplitter.from_language(
@@ -97,12 +104,14 @@ def split_documents(documents, language, chunk_size=2000, chunk_overlap=200):
 py_docs = split_documents(py_documents, Language.PYTHON)
 mdx_docs = split_documents(mdx_documents, Language.MARKDOWN)
 ipynb_docs = split_documents(ipynb_documents, Language.PYTHON)
+wiki_docs = split_documents(wiki_documents, Language.MARKDOWN)

-print(f"분할된 .py 파일의 개수: {len(py_docs)}")
-print(f"분할된 .mdx 파일의 개수: {len(mdx_docs)}")
-print(f"분할된 .ipynb 파일의 개수: {len(ipynb_docs)}")
+print(f"분할된 .py 문서의 개수: {len(py_docs)}")
+print(f"분할된 .mdx 문서의 개수: {len(mdx_docs)}")
+print(f"분할된 .ipynb 문서의 개수: {len(ipynb_docs)}")
+print(f"분할된 wiki 문서의 개수: {len(wiki_docs)}")

-combined_documents = py_docs + mdx_docs + ipynb_docs
+combined_documents = py_docs + mdx_docs + ipynb_docs + wiki_docs
 print(f"총 도큐먼트 개수: {len(combined_documents)}")


@@ -132,19 +141,21 @@ cached_embeddings = CacheBackedEmbeddings.from_bytes_store(

 # Create and save FAISS index
 FAISS_DB_INDEX = "./langchain_faiss"
-# faiss_db = FAISS.from_documents(
-#     documents=combined_documents,
-#     embedding=cached_embeddings,
-# )
-# faiss_db.save_local(folder_path=FAISS_DB_INDEX)
+if not os.path.exists(FAISS_DB_INDEX):
+    faiss_db = FAISS.from_documents(
+        documents=combined_documents,
+        embedding=cached_embeddings,
+    )
+    faiss_db.save_local(folder_path=FAISS_DB_INDEX)

 # Create and save Chroma index
 CHROMA_DB_INDEX = "./langchain_chroma"
-# chroma_db = Chroma.from_documents(
-#     documents=combined_documents,
-#     embedding=cached_embeddings,
-#     persist_directory=CHROMA_DB_INDEX,
-# )
+if not os.path.exists(CHROMA_DB_INDEX):
+    chroma_db = Chroma.from_documents(
+        documents=combined_documents,
+        embedding=cached_embeddings,
+        persist_directory=CHROMA_DB_INDEX,
+    )

 # load vectorstore
 faiss_db = FAISS.load_local(
86
  print(f".ipynb 파일의 개수: {len(ipynb_documents)}")
87
 
88
 
89
+ ## wikidocs
90
+ import pandas as pd
91
+
92
+ df = pd.read_parquet("wikidocs_14314.parquet")
93
+ wiki_documents = text_splitter.split_documents(loader.load())
94
+
95
+
96
  # Split documents into chunks
97
  def split_documents(documents, language, chunk_size=2000, chunk_overlap=200):
98
  splitter = RecursiveCharacterTextSplitter.from_language(
 
104
  py_docs = split_documents(py_documents, Language.PYTHON)
105
  mdx_docs = split_documents(mdx_documents, Language.MARKDOWN)
106
  ipynb_docs = split_documents(ipynb_documents, Language.PYTHON)
107
+ wiki_docs = split_documents(wiki_documents, Language.MARKDOWN)
108
 
109
+ print(f"λΆ„ν• λœ .py λ¬Έμ„œμ˜ 개수: {len(py_docs)}")
110
+ print(f"λΆ„ν• λœ .mdx λ¬Έμ„œμ˜ 개수: {len(mdx_docs)}")
111
+ print(f"λΆ„ν• λœ .ipynb λ¬Έμ„œμ˜ 개수: {len(ipynb_docs)}")
112
+ print(f"λΆ„ν• λœ .(wiki λ¬Έμ„œμ˜ 개수: {len(wiki_docs)}")
113
 
114
+ combined_documents = py_docs + mdx_docs + ipynb_docs + wiki_docs
115
  print(f"총 λ„νλ¨ΌνŠΈ 개수: {len(combined_documents)}")
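Only the first lines of split_documents appear in the hunks. A sketch of how the helper plausibly reads in full is below, assuming it simply delegates to the language-aware splitter; the closing arguments, the return statement, and the import path are assumptions, not code taken from app.py.

from langchain_text_splitters import Language, RecursiveCharacterTextSplitter  # import path assumed

def split_documents(documents, language, chunk_size=2000, chunk_overlap=200):
    # Language-aware recursive splitting: separators are chosen per language,
    # then documents are cut into overlapping chunks of roughly chunk_size characters.
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)

# Usage as in the diff: wikidocs pages are prose, so Markdown separators are a reasonable fit.
wiki_docs = split_documents(wiki_documents, Language.MARKDOWN)  # wiki_documents from the sketch above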
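The FAISS change turns the previously commented-out index creation into a build-once step: the index is created and saved only when ./langchain_faiss does not exist, and the code further down loads it. A sketch of the whole round trip follows, assuming combined_documents and cached_embeddings from earlier in app.py; the load_local arguments beyond folder_path are assumptions, since that call is truncated in the diff.

import os
from langchain_community.vectorstores import FAISS  # import path assumed

FAISS_DB_INDEX = "./langchain_faiss"
if not os.path.exists(FAISS_DB_INDEX):
    # First run only: embed every combined document and persist the index to disk.
    faiss_db = FAISS.from_documents(documents=combined_documents, embedding=cached_embeddings)
    faiss_db.save_local(folder_path=FAISS_DB_INDEX)

# Every run (including the code after this hunk) reloads the persisted index.
faiss_db = FAISS.load_local(
    folder_path=FAISS_DB_INDEX,
    embeddings=cached_embeddings,
    allow_dangerous_deserialization=True,  # assumed: required by recent langchain releases
)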
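The Chroma change follows the same build-once pattern, with persist_directory pointing the collection at ./langchain_chroma. One detail the diff does not show is how the store is reopened on later runs; the else branch below is a sketch under the assumption that the Chroma constructor is used with the same directory, not code from this commit.

import os
from langchain_community.vectorstores import Chroma  # import path assumed

CHROMA_DB_INDEX = "./langchain_chroma"
if not os.path.exists(CHROMA_DB_INDEX):
    # First run: embed the combined documents and persist the collection on disk.
    chroma_db = Chroma.from_documents(
        documents=combined_documents,
        embedding=cached_embeddings,
        persist_directory=CHROMA_DB_INDEX,
    )
else:
    # Assumed reopening path: attach to the persisted collection with the same embeddings.
    chroma_db = Chroma(persist_directory=CHROMA_DB_INDEX, embedding_function=cached_embeddings)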