"""Load every PDF under a directory, index it in Weaviate, and run one query."""

import argparse
import os
from pathlib import Path

import weaviate
from llama_index import download_loader
from llama_index.vector_stores import WeaviateVectorStore
from llama_index import VectorStoreIndex, StorageContext


def get_pdf_files(base_path, loader) -> list:
    """
    Load every PDF file found in a directory and its subdirectories.

    Parameters:
    - base_path (str): The path to the starting directory.
    - loader: Object exposing ``load_data(file=Path)`` that returns an
      iterable of loaded documents for a single PDF file.

    Returns:
    - list: The documents produced by ``loader`` for all PDF files found,
      concatenated in sorted traversal order. Note that these are the
      loaded documents, not the file paths.

    Raises:
    - FileNotFoundError: If ``base_path`` does not exist.
    - NotADirectoryError: If ``base_path`` exists but is not a directory.
    """
    # Check if the base path exists and is a directory
    if not os.path.exists(base_path):
        raise FileNotFoundError(f"The specified base path does not exist: {base_path}")
    if not os.path.isdir(base_path):
        raise NotADirectoryError(f"The specified base_path is not a directory: {base_path}")

    documents = []
    # Loop through all directories and files starting from the base path
    for root, dirs, files in os.walk(base_path):
        # Sorting in place makes os.walk descend in a reproducible order,
        # so the resulting document order does not depend on the filesystem.
        dirs.sort()
        for filename in sorted(files):
            # Compare case-insensitively so "REPORT.PDF" is not skipped.
            if filename.lower().endswith('.pdf'):
                documents.extend(loader.load_data(file=Path(root, filename)))
    return documents


def _require_env(*names):
    """
    Return the values of the named environment variables as a dict.

    Raises:
    - KeyError: If any variable is unset; the message lists all missing ones.
    """
    missing = [name for name in names if name not in os.environ]
    if missing:
        raise KeyError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )
    return {name: os.environ[name] for name in names}


def main(args):
    """
    Index the PDFs under ``args.pdf_dir`` in Weaviate and print a query answer.

    Parameters:
    - args: Parsed command-line arguments providing ``pdf_dir`` (directory to
      scan), ``customer`` (used as the Weaviate index name) and ``query``
      (the question sent to the query engine).

    Reads WEAVIATE_URL, WEAVIATE_API_KEY and OPENAI_API_KEY from the
    environment.

    Raises:
    - KeyError: If any of the required environment variables is unset.
    - FileNotFoundError / NotADirectoryError: Propagated from get_pdf_files.
    """
    # Validate configuration first so a missing variable is reported before
    # any PDF is parsed, rather than after all the loading work is done.
    env = _require_env("WEAVIATE_URL", "WEAVIATE_API_KEY", "OPENAI_API_KEY")

    PDFReader = download_loader("PDFReader")
    loader = PDFReader()
    documents = get_pdf_files(args.pdf_dir, loader)

    client = weaviate.Client(
        url=env["WEAVIATE_URL"],
        auth_client_secret=weaviate.AuthApiKey(api_key=env["WEAVIATE_API_KEY"]),
        additional_headers={
            "X-OpenAI-Api-Key": env["OPENAI_API_KEY"]
        }
    )

    # construct vector store
    vector_store = WeaviateVectorStore(
        weaviate_client=client,
        index_name=args.customer,
        text_key="content",
    )

    # setting up the storage for the embeddings
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # set up the index
    index = VectorStoreIndex(documents, storage_context=storage_context)

    query_engine = index.as_query_engine()
    response = query_engine.query(args.query)
    print(response)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process and query PDF files.')
    parser.add_argument('--customer', default='Ausy', help='Customer name')
    parser.add_argument('--pdf_dir', default='./data', help='Directory containing PDFs')
    parser.add_argument('--query', default='What is CX0 customer exprience office?', help='Query to execute')

    args = parser.parse_args()
    main(args)