anpigon committed on
Commit 9c8a6b2
1 Parent(s): 3a8ae74

feat: Update app.py with DataFrameLoader and LongContextReorder
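
This commit loads the wikidocs corpus from a parquet file with DataFrameLoader instead of the earlier ad-hoc pandas snippet, and reorders retrieved documents with LongContextReorder inside the RAG chain. As a rough, standalone sketch of what DataFrameLoader does here (the DataFrame values and the extra "source" column are illustrative; only page_content_column="content" comes from the diff):

# Minimal sketch: DataFrameLoader turns each DataFrame row into a Document,
# taking page_content from one column and the remaining columns as metadata.
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader

df = pd.DataFrame(
    {
        "content": ["First wikidocs page text", "Second wikidocs page text"],
        "source": ["wikidocs/1", "wikidocs/2"],  # hypothetical metadata column
    }
)
loader = DataFrameLoader(df, page_content_column="content")
wiki_documents = loader.load()
print(wiki_documents[0].page_content)  # -> "First wikidocs page text"
print(wiki_documents[0].metadata)      # -> {"source": "wikidocs/1"}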

Files changed (1)
  1. app.py +21 -7
app.py CHANGED
@@ -2,6 +2,7 @@ import os
 import torch
 
 import gradio as gr
+import pandas as pd
 from dotenv import load_dotenv
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.embeddings import CacheBackedEmbeddings
@@ -10,7 +11,11 @@ from langchain.retrievers import EnsembleRetriever
 from langchain.storage import LocalFileStore
 from langchain_anthropic import ChatAnthropic
 from langchain_community.chat_models import ChatOllama
-from langchain_community.document_loaders import NotebookLoader, TextLoader
+from langchain_community.document_loaders import (
+    NotebookLoader,
+    TextLoader,
+    DataFrameLoader,
+)
 from langchain_community.document_loaders.generic import GenericLoader
 from langchain_community.document_loaders.parsers.language.language_parser import (
     LanguageParser,
@@ -21,7 +26,11 @@ from langchain_core.callbacks.manager import CallbackManager
 from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import PromptTemplate
-from langchain_core.runnables import ConfigurableField, RunnablePassthrough
+from langchain_core.runnables import (
+    ConfigurableField,
+    RunnablePassthrough,
+    RunnableLambda,
+)
 from langchain_google_genai import GoogleGenerativeAI
 from langchain_groq import ChatGroq
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
@@ -29,6 +38,8 @@ from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
 
 from langchain_cohere import CohereRerank
 from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
+from langchain_community.document_transformers import LongContextReorder
+
 
 # Load environment variables
 load_dotenv()
@@ -87,10 +98,9 @@ print(f".ipynb 파일의 개수: {len(ipynb_documents)}")
 
 
 ## wikidocs
-import pandas as pd
-
-df = pd.read_parquet("wikidocs_14314.parquet")
-wiki_documents = text_splitter.split_documents(loader.load())
+df = pd.read_parquet("./docs/wikidocs_14314.parquet")
+loader = DataFrameLoader(df, page_content_column="content")
+wiki_documents = loader.load()
 
 
 # Split documents into chunks
@@ -257,7 +267,11 @@ llm = ChatOpenAI(
 
 # Create retrieval-augmented generation chain
 rag_chain = (
-    {"context": compression_retriever, "question": RunnablePassthrough()}
+    {
+        "context": compression_retriever
+        | RunnableLambda(LongContextReorder().transform_documents),
+        "question": RunnablePassthrough(),
+    }
     | prompt
     | llm
     | StrOutputParser()
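
For the rag_chain change, LongContextReorder is wrapped in RunnableLambda so it can be piped directly after compression_retriever. A minimal sketch of that pattern, independent of app.py (the Document list below is a stand-in for real retriever output):

# Minimal sketch: reorder relevance-sorted documents so the strongest hits sit at the
# start and end of the context ("lost in the middle" mitigation).
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
from langchain_community.document_transformers import LongContextReorder

# Stand-in for documents returned by a retriever, already sorted by relevance.
docs = [Document(page_content=f"doc {rank}") for rank in range(1, 7)]

reorder = RunnableLambda(LongContextReorder().transform_documents)
print([d.page_content for d in reorder.invoke(docs)])

Because RunnableLambda wraps transform_documents, the reordering runs on every retrieval call without changing the retriever itself; the prompt then sees the strongest documents at the edges of the context window.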