File size: 2,245 Bytes
2e98c79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c18b9d6
2e98c79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import json
import chromadb

# Open a Chroma client whose data is stored on disk under the given path.
# NOTE(review): the absolute path is machine-specific — adjust before running elsewhere.
client = chromadb.PersistentClient(path="/home/johannes/Desktop/proj/Datenbank/chroma")

# Fetch the "phil_en" collection, creating it if it does not exist yet.
# The "hnsw:space": "cosine" metadata is Chroma's setting for the distance
# metric of the collection's index.
# NOTE(review): presumably this metadata only takes effect when the collection
# is first created — confirm against the installed Chroma version.
collection = client.get_or_create_collection(name="phil_en", metadata={"hnsw:space": "cosine"})

# Look up a metadata field, substituting "Unknown" when it is absent or None
def get_metadata(entry, key):
    """Return ``entry.get(key)``, or ``"Unknown"`` when that value is None.

    A missing key and a key explicitly mapped to ``None`` are treated alike.
    Other falsy values (``0``, ``""``, ...) are returned unchanged.
    """
    value = entry.get(key)
    if value is None:
        return "Unknown"
    return value

# Directory that is scanned for the *.json input files; each file is expected
# to hold entries that already carry a pre-computed embedding.
# NOTE(review): the absolute path is machine-specific — adjust before running elsewhere.
json_dir = "/home/johannes/Desktop/proj/Datenbank/bücher/en/verarbeitet/ready_for_chroma/"

# Read one UTF-8 encoded JSON file and hand back the parsed content
def load_json_data(file_path):
    """Parse the JSON document stored at *file_path* and return the result.

    The file is opened in text mode with UTF-8 decoding. Errors from opening
    the file or from parsing invalid JSON propagate to the caller.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Collect the full paths of all *.json files in json_dir. os.listdir() returns
# names in arbitrary order, so the paths are sorted to make the ingestion
# order (and the resulting log output) reproducible between runs.
json_files = sorted(
    os.path.join(json_dir, file_name)
    for file_name in os.listdir(json_dir)
    if file_name.endswith('.json')
)

# Turn the parsed entries of one JSON file into the parallel lists passed to collection.add()
def _collect_records(data):
    """Build the ``documents``/``embeddings``/``metadatas``/``ids`` lists for one file.

    Args:
        data: Iterable of entry mappings. Every entry is indexed with the keys
            ``'text'``, ``'embedding'`` and ``'entry_number'``; the metadata
            fields ``'autor'``, ``'buch'``, ``'abschnitt'`` and ``'titel'``
            fall back to ``"Unknown"`` when missing or None.

    Returns:
        A 4-tuple ``(documents, embeddings, metadatas, ids)`` of lists that
        all have one item per entry, in input order.

    Raises:
        KeyError: If a mapping entry lacks one of the three required keys.
    """
    documents, embeddings, metadatas, ids = [], [], [], []
    for entry in data:
        documents.append(entry['text'])
        embeddings.append(entry['embedding'])
        # The source files use German field names; the collection stores them in English.
        metadatas.append({
            'author': get_metadata(entry, 'autor'),
            'book': get_metadata(entry, 'buch'),
            'section': get_metadata(entry, 'abschnitt'),
            'title': get_metadata(entry, 'titel')
        })
        # The ID is the entry number rendered as a string.
        # NOTE(review): IDs carry no file prefix, so entries from different files
        # that share an entry number would collide — confirm numbers are globally unique.
        ids.append(f"{entry['entry_number']}")
    return documents, embeddings, metadatas, ids


# Loop through each file, read the data, and add it to the collection
for file_path in json_files:
    file_name = os.path.basename(file_path)
    try:
        documents, embeddings, metadatas, ids = _collect_records(load_json_data(file_path))

        # An empty file would hand empty lists to collection.add(), which is
        # rejected with a validation error; report it as a skip instead.
        if not ids:
            print(f"Skipped {file_name}: no entries found")
            continue

        # Add the entries to the collection with their pre-computed embeddings
        collection.add(documents=documents, embeddings=embeddings, metadatas=metadatas, ids=ids)
        print(f"Added {len(documents)} documents from {file_name}")
    except Exception as e:
        # Best-effort ingestion: report the failing file and carry on with the next one.
        print(f"Failed to process {file_path}: {e}")