"""Convert PDF files to JSON via the Vertopal HTTP API and store the JSON in Milvus.

Configuration is read from the environment (optionally via a ``.env`` file):
``VERTOPAL_API_KEY``, ``ZILLIZ_CLUSTER_ENDPOINT``, ``ZILLIZ_TOKEN`` and,
optionally, ``VERTOPAL_APP_ID``.
"""

import json
import os
import time

import requests
from dotenv import load_dotenv
# Collection and utility are not used below; the import is kept so that any
# code relying on these module-level names keeps working.
from pymilvus import Collection, DataType, MilvusClient, utility  # noqa: F401

load_dotenv()

VERTOPAL_API_KEY = os.getenv("VERTOPAL_API_KEY")
# Falls back to the literal placeholder the script has always sent.
VERTOPAL_APP_ID = os.getenv("VERTOPAL_APP_ID", "[APP_ID]")
ZILLIZ_CLUSTER_ENDPOINT = os.getenv("ZILLIZ_CLUSTER_ENDPOINT")
ZILLIZ_TOKEN = os.getenv("ZILLIZ_TOKEN")

# Upper bound (seconds) for each HTTP call so a stalled request cannot hang
# the whole run.
REQUEST_TIMEOUT_SECONDS = 120

# Declared max_length of the "json_data" VARCHAR field.
JSON_DATA_MAX_LENGTH = 65535


def _vertopal_headers():
    """Return the Authorization header used for every Vertopal request."""
    return {"Authorization": f"Bearer {VERTOPAL_API_KEY}"}


def convert_pdf_to_json(file_path):
    """Upload the file at *file_path* for conversion and return its connector.

    Args:
        file_path: Path of the PDF to send.

    Returns:
        The value found at ``["result"]["output"]["connector"]`` in the JSON
        response.

    Raises:
        OSError: If the file cannot be opened.
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the request exceeds ``REQUEST_TIMEOUT_SECONDS``.
        KeyError: If the response lacks the expected keys.
    """
    url = "https://api.vertopal.com/v1/convert/file"
    # NOTE(review): with form encoding, requests iterates a nested dict value
    # and sends only its keys, so "parameters" likely reaches the server as
    # "output" without the "json" value -- confirm the payload shape against
    # the Vertopal API documentation.
    data = {
        "app": VERTOPAL_APP_ID,
        "parameters": {"output": "json"},
    }
    # The context manager closes the handle even when the request fails.
    with open(file_path, "rb") as pdf_file:
        response = requests.post(
            url,
            headers=_vertopal_headers(),
            data=data,
            files={"file": pdf_file},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    response.raise_for_status()
    return response.json()["result"]["output"]["connector"]


def download_json_file(connector):
    """Request the download for *connector* and return the decoded JSON body.

    Args:
        connector: Connector value returned by ``convert_pdf_to_json``.

    Returns:
        The parsed JSON response, unmodified.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the request exceeds ``REQUEST_TIMEOUT_SECONDS``.
    """
    url = "https://api.vertopal.com/v1/download/url/get"
    data = {
        "app": VERTOPAL_APP_ID,
        "connector": connector,
    }
    response = requests.post(
        url,
        headers=_vertopal_headers(),
        data=data,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.json()


def create_milvus_client_and_collection(collection_name):
    """Connect to Milvus and make sure *collection_name* exists.

    The collection is created on first use with an INT64 primary key
    (``primary_key``, caller-supplied) and a VARCHAR ``json_data`` field.

    Args:
        collection_name: Name of the collection to open or create.

    Returns:
        A ``(client, collection_name)`` tuple. ``MilvusClient`` addresses
        collections by name, so the name is the handle to pass to
        ``client.insert``.
    """
    client = MilvusClient(uri=ZILLIZ_CLUSTER_ENDPOINT, token=ZILLIZ_TOKEN)
    if not client.has_collection(collection_name):
        # NOTE(review): this schema has no vector field; Milvus deployments
        # generally require one -- confirm the target cluster accepts it.
        schema = MilvusClient.create_schema(
            auto_id=False,
            enable_dynamic_field=True,
            description="",
        )
        schema.add_field(
            field_name="primary_key",
            datatype=DataType.INT64,
            description="The Primary Key",
            is_primary=True,
            auto_id=False,
        )
        schema.add_field(
            field_name="json_data",
            datatype=DataType.VARCHAR,
            description="JSON Data",
            max_length=JSON_DATA_MAX_LENGTH,
        )
        client.create_collection(collection_name, schema=schema)
    return client, collection_name


def _next_primary_key():
    """Return a primary key derived from the current time in nanoseconds.

    The schema sets ``auto_id=False``, so every row needs an explicit key.
    A timestamp avoids querying the collection for its size and does not
    repeat across runs.
    """
    return time.time_ns()


def upload_json_to_milvus(json_data, collection_name):
    """Serialize *json_data* and insert it as one row into *collection_name*.

    Args:
        json_data: Any JSON-serializable object.
        collection_name: Target collection; created if it does not exist.

    Raises:
        ValueError: If the serialized JSON is longer than the ``json_data``
            field allows.
    """
    # json.dumps escapes non-ASCII by default, so len() is also the byte size.
    payload = json.dumps(json_data)
    if len(payload) > JSON_DATA_MAX_LENGTH:
        raise ValueError(
            f"Serialized JSON is {len(payload)} characters; the json_data "
            f"field holds at most {JSON_DATA_MAX_LENGTH}"
        )

    client, collection = create_milvus_client_and_collection(collection_name)
    try:
        client.insert(
            collection_name=collection,
            data=[{"primary_key": _next_primary_key(), "json_data": payload}],
        )
    finally:
        # A client is opened per upload; release its connection every time.
        client.close()


def process_pdfs(directory):
    """Convert each PDF directly inside *directory* and upload its JSON.

    Files are matched by a case-insensitive ``.pdf`` suffix and handled in
    sorted order. Results go to the ``pdf_json_collection`` collection. The
    first failure propagates and stops the run.

    Args:
        directory: Folder to scan (not recursive).
    """
    file_paths = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.lower().endswith(".pdf")
    ]
    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        connector = convert_pdf_to_json(file_path)
        json_data = download_json_file(connector)
        upload_json_to_milvus(json_data, "pdf_json_collection")
        print(f"Uploaded JSON data for file: {file_path}")


def upload_persona_json(file_path):
    """Load the JSON file at *file_path* and upload it to ``persona_collection``.

    Args:
        file_path: Path of the persona JSON file, read as UTF-8.
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as f:
        persona_json = json.load(f)
    upload_json_to_milvus(persona_json, "persona_collection")
    print("Uploaded persona JSON to Milvus")


if __name__ == "__main__":
    pdf_directory = "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\ILYA\\pdfs"
    process_pdfs(pdf_directory)

    persona_json_path = "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\persona.json"
    upload_persona_json(persona_json_path)