Spaces:

drift-ai
/

internal-document-qa

Runtime error

App Files Files Community

Vincent Claes committed on Sep 20, 2023

Commit

8b6eec6

•

1 Parent(s): 8c6aaca

first try with Verba - not a complete success

Browse files

Files changed (4) hide show

README.md +10 -1
import_data.py +74 -0
poetry.lock +0 -0
pyproject.toml +19 -0

README.md CHANGED Viewed

	@@ -1 +1,10 @@
1	- # ~~rag-weviate-verba~~

+# Ausy RAG Demo
+```bash
+poetry shell
+poetry install
+export OPENAI_API_KEY=<...>
+export VERBA_URL=<...>
+export VERBA_API_KEY=<...>
+verba start --model "gpt-3.5-turbo"
+```

import_data.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import os
+import weaviate
+from llama_index import download_loader
+from llama_index.vector_stores import WeaviateVectorStore
+from llama_index import VectorStoreIndex, StorageContext
+from pathlib import Path
+import argparse
def get_pdf_files(base_path, loader):
    """
    Load every PDF found under a directory tree and collect the results.

    Despite its name, this function does not return file paths: each PDF is
    handed to ``loader.load_data`` and the items it returns are gathered into
    one flat list.

    Parameters:
    - base_path (str): The path to the starting directory.
    - loader: Object exposing ``load_data(file=Path)`` that returns an
      iterable of loaded items for a single PDF file.
    Returns:
    - list: Everything returned by ``loader.load_data`` for all PDFs found,
      in a stable order (directories and files are visited sorted by name).
    Raises:
    - FileNotFoundError: If ``base_path`` does not exist.
    - NotADirectoryError: If ``base_path`` exists but is not a directory.
    """
    # Check if the base path exists and is a directory
    if not os.path.exists(base_path):
        raise FileNotFoundError(f"The specified base path does not exist: {base_path}")
    if not os.path.isdir(base_path):
        raise NotADirectoryError(f"The specified base_path is not a directory: {base_path}")

    documents = []
    # Loop through all directories and files starting from the base path
    for root, dirs, files in os.walk(base_path):
        # Sorting in place makes os.walk descend into subdirectories in a
        # reproducible order instead of the filesystem's arbitrary one.
        dirs.sort()
        for filename in sorted(files):
            # Match the extension case-insensitively so "REPORT.PDF" is not skipped
            if filename.lower().endswith('.pdf'):
                documents.extend(loader.load_data(file=Path(root, filename)))
    return documents
def main(args):
    """
    Index the PDFs under ``args.pdf_dir`` in Weaviate and print the answer to one query.

    Parameters:
    - args: Parsed command-line namespace providing ``customer`` (used as the
      Weaviate index name), ``pdf_dir`` and ``query``.
    Raises:
    - KeyError: If WEAVIATE_URL, WEAVIATE_API_KEY or OPENAI_API_KEY is not set
      in the environment.
    """
    # Validate the credentials before doing any work, so a missing variable
    # fails immediately instead of after every PDF has been downloaded/parsed.
    # NOTE: the README's setup snippet exports VERBA_URL / VERBA_API_KEY, which
    # this script does not read.
    required_vars = ("WEAVIATE_URL", "WEAVIATE_API_KEY", "OPENAI_API_KEY")
    missing = [name for name in required_vars if name not in os.environ]
    if missing:
        raise KeyError(f"Missing required environment variable(s): {', '.join(missing)}")

    PDFReader = download_loader("PDFReader")
    loader = PDFReader()
    documents = get_pdf_files(args.pdf_dir, loader)
    client = weaviate.Client(
        url=os.environ["WEAVIATE_URL"],
        auth_client_secret=weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
        additional_headers={
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]
        }
    )
    # construct vector store
    vector_store = WeaviateVectorStore(weaviate_client=client, index_name=args.customer, text_key="content")
    # setting up the storage for the embeddings
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # set up the index
    index = VectorStoreIndex(documents, storage_context=storage_context)
    query_engine = index.as_query_engine()
    response = query_engine.query(args.query)
    print(response)
if __name__ == "__main__":
    # Command-line entry point. Every option has a default, so the script can
    # be run without arguments.
    parser = argparse.ArgumentParser(description="Process and query PDF files.")
    # (flag, default value, help text) for each supported option
    option_table = (
        ("--customer", "Ausy", "Customer name"),
        ("--pdf_dir", "./data", "Directory containing PDFs"),
        ("--query", "What is CX0 customer exprience office?", "Query to execute"),
    )
    for flag, default_value, help_text in option_table:
        parser.add_argument(flag, default=default_value, help=help_text)
    args = parser.parse_args()
    main(args)

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,19 @@

+[tool.poetry]
+name = "ausy-rag-demo"
+version = "0.1.0"
+description = ""
+authors = ["Vincent Claes <[email protected]>"]
+readme = "README.md"
+packages = [{include = "ausy_rag_demo"}]
+[tool.poetry.dependencies]
+python = "^3.9"
+llama-index = "^0.8.29.post1"
+weaviate-client = "^3.24.1"
+pypdf = "^3.16.1"
+goldenverba = "^0.2.3"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"