Vincent Claes committed on
Commit
8b6eec6
1 Parent(s): 8c6aaca

first try with verba - not a complete success

Browse files
Files changed (4) hide show
  1. README.md +10 -1
  2. import_data.py +74 -0
  3. poetry.lock +0 -0
  4. pyproject.toml +19 -0
README.md CHANGED
@@ -1 +1,10 @@
1
- # rag-weviate-verba
 
 
 
 
 
 
 
 
 
 
1
+ # Ausy RAG Demo
2
+
3
+ ```bash
4
+ poetry shell
5
+ poetry install
6
+ export OPENAI_API_KEY=<...>
7
+ export VERBA_URL=<...>
8
+ export VERBA_API_KEY=<...>
9
+ verba start --model "gpt-3.5-turbo"
10
+ ```
import_data.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import weaviate
3
+ from llama_index import download_loader
4
+ from llama_index.vector_stores import WeaviateVectorStore
5
+ from llama_index import VectorStoreIndex, StorageContext
6
+ from pathlib import Path
7
+ import argparse
8
+
9
def get_pdf_files(base_path, loader):
    """
    Load every PDF found in a directory and its subdirectories.

    Despite its name, this function does not return file paths: each PDF
    that is found is handed to ``loader.load_data`` and the items it returns
    are collected into a single flat list.

    Parameters:
    - base_path (str): The path to the starting directory.
    - loader: Object exposing ``load_data(file=<pathlib.Path>)`` that returns
      an iterable of loaded items for one PDF file.

    Returns:
    - list: Every item returned by ``loader.load_data`` for all PDF files
      found, in the order the files are visited by ``os.walk``.

    Raises:
    - FileNotFoundError: If ``base_path`` does not exist.
    - NotADirectoryError: If ``base_path`` exists but is not a directory.
    """
    # Fail early with a precise error instead of silently returning nothing
    # (os.walk yields no entries for a missing or non-directory path).
    if not os.path.exists(base_path):
        raise FileNotFoundError(f"The specified base path does not exist: {base_path}")
    if not os.path.isdir(base_path):
        raise NotADirectoryError(f"The specified base_path is not a directory: {base_path}")

    documents = []
    for root, _dirs, files in os.walk(base_path):
        for filename in files:
            # Compare the extension case-insensitively so files such as
            # "REPORT.PDF" are not skipped.
            if filename.lower().endswith('.pdf'):
                documents.extend(loader.load_data(file=Path(root, filename)))

    return documents
36
+
37
+
38
def main(args):
    """
    Index the PDFs under ``args.pdf_dir`` in Weaviate and print a query answer.

    Parameters:
    - args: Parsed command-line namespace providing ``pdf_dir`` (directory
      scanned for PDFs), ``customer`` (used as the Weaviate index name) and
      ``query`` (the question sent to the query engine).

    Environment:
    - WEAVIATE_URL, WEAVIATE_API_KEY and OPENAI_API_KEY must be set; a
      missing variable raises ``KeyError``.
    """
    # Obtain the PDF reader class and load all documents before touching
    # the vector database.
    reader_cls = download_loader("PDFReader")
    documents = get_pdf_files(args.pdf_dir, reader_cls())

    # Connection settings come from the environment, read in the same order
    # as the keyword arguments below.
    weaviate_url = os.environ["WEAVIATE_URL"]
    credentials = weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"])
    extra_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}
    client = weaviate.Client(
        url=weaviate_url,
        auth_client_secret=credentials,
        additional_headers=extra_headers,
    )

    # The vector store is keyed by customer name; text is stored under "content".
    store = WeaviateVectorStore(
        weaviate_client=client,
        index_name=args.customer,
        text_key="content",
    )

    # Build the index on top of that store, then answer the query.
    index = VectorStoreIndex(
        documents,
        storage_context=StorageContext.from_defaults(vector_store=store),
    )
    print(index.as_query_engine().query(args.query))
63
+
64
+
65
def parse_cli_args(argv=None):
    """
    Parse the command-line options of this script.

    Parameters:
    - argv (list of str or None): Arguments to parse. ``None`` (the default)
      makes argparse read ``sys.argv[1:]``.

    Returns:
    - argparse.Namespace: With ``customer``, ``pdf_dir`` and ``query``.
    """
    parser = argparse.ArgumentParser(description='Process and query PDF files.')

    parser.add_argument('--customer', default='Ausy', help='Customer name')
    parser.add_argument('--pdf_dir', default='./data', help='Directory containing PDFs')
    # NOTE(review): "CX0" may have been intended as "CXO" - confirm with the author.
    parser.add_argument('--query', default='What is CX0 customer experience office?', help='Query to execute')

    return parser.parse_args(argv)


if __name__ == "__main__":
    main(parse_cli_args())
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "ausy-rag-demo"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Vincent Claes <[email protected]>"]
6
+ readme = "README.md"
7
+ packages = [{include = "ausy_rag_demo"}]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.9"
11
+ llama-index = "^0.8.29.post1"
12
+ weaviate-client = "^3.24.1"
13
+ pypdf = "^3.16.1"
14
+ goldenverba = "^0.2.3"
15
+
16
+
17
+ [build-system]
18
+ requires = ["poetry-core"]
19
+ build-backend = "poetry.core.masonry.api"