File size: 3,745 Bytes
3b12eab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import requests
import re
from html import unescape
from sentence_transformers import SentenceTransformer
import chromadb
import yaml


# Load the project configuration once at import time; the functions below
# read their settings (embedding model name, collection name) from
# `config_data`.
try:
    with open("./config.yaml", 'r') as file:
        config_data = yaml.safe_load(file)
except FileNotFoundError as e:
    # The file really is missing: keep the original message, but chain the
    # cause so the traceback still shows the underlying error.
    raise Exception("Not able to find the file ./config.yaml") from e
except Exception as e:
    # The file exists but could not be read or parsed (permissions, invalid
    # YAML, ...). Reporting this as "not found" would be misleading.
    raise Exception(f"Not able to load the file ./config.yaml: {e}") from e


# function to fetch data from WordPress site
def fetch_wordpress_data(site_url, timeout=30):
    """
    Fetches posts from a WordPress site using its REST API.

    Args:
    site_url (str): The base URL of the WordPress site. A trailing slash
        is tolerated.
    timeout (float or tuple): Timeout in seconds passed to ``requests.get``.
        Defaults to 30.

    Returns:
    The decoded JSON body of the ``/wp-json/wp/v2/posts`` response, or None
    if the request failed or the body was not valid JSON.
    """
    # Strip trailing slashes so "https://site/" does not produce "//wp-json".
    base_url = site_url.rstrip('/')
    api_url = f"{base_url}/wp-json/wp/v2/posts"
    # NOTE(review): no pagination parameters are sent, so only the page the
    # server returns by default is fetched - confirm this is intended.
    try:
        # Send GET request to WordPress API; without a timeout an
        # unresponsive server would block this call indefinitely.
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # Raise exception for unsuccessful responses

        # Extract and return JSON data from response
        return response.json()

    except (requests.exceptions.RequestException, ValueError) as e:
        # RequestException covers connection errors, timeouts and HTTP error
        # statuses; ValueError covers a body that is not valid JSON.
        print("Error fetching WordPress data:", e)
        return None

def preprocess_text(text):
    """
    Preprocesses text by removing HTML tags, decoding HTML entities,
    collapsing repeated newlines and trimming surrounding whitespace.

    Args:
    text (str): The text to be preprocessed.

    Returns:
    str: The preprocessed text.
    """
    # Remove HTML tags. `[^>]*` also matches newlines (unlike `.*?` without
    # re.DOTALL), so tags whose attributes wrap across lines are removed too.
    clean_text = re.sub(r'<[^>]*>', '', text)
    # Decode HTML entities such as &amp; only after tag removal, so escaped
    # markup like &lt;b&gt; survives as literal text.
    clean_text = unescape(clean_text)
    # Collapse runs of consecutive newlines into a single newline
    clean_text = re.sub(r'\n+', '\n', clean_text)
    # Trim leading and trailing whitespace (including newlines)
    return clean_text.strip()

# Models already constructed, keyed by model name, so repeated calls to
# generate_embeddings do not rebuild the model each time.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for model_name, constructing it once."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model


def generate_embeddings(text):
    """
    Generates sentence embeddings using the embedding model named in the
    configuration (``config_data['embedding_model']``).

    Args:
    text (str): The input text.

    Returns:
    list: The result of ``model.encode(text)`` converted with ``tolist()``.
    """
    # Reuse the already-constructed model; this function is called once per
    # post, and building the model on every call is needlessly expensive.
    model = _get_embedding_model(config_data['embedding_model'])

    # Generate embeddings for input text
    embeddings = model.encode(text)
    return embeddings.tolist()

def extract_text(post):
    """
    Pulls the rendered content out of a WordPress post and cleans it.

    Args:
    post (dict): The WordPress post data; its ``['content']['rendered']``
        entry is read.

    Returns:
    str: The post content after being passed through ``preprocess_text``.
    """
    rendered_html = post['content']['rendered']
    cleaned = preprocess_text(rendered_html)
    return cleaned

def create_vector_store_and_add_posts(wordpress_data):
    """
    Creates a vector store in Chroma database and adds WordPress posts to it.

    The persistent client is opened at ``./posts_db`` and the collection name
    comes from ``config_data['collection_name']``. Posts are written with
    ``upsert``, keyed by the post id converted to a string.

    Args:
    wordpress_data (list): List of WordPress post data. ``None`` or an empty
        list is accepted and results in no posts being written.

    Returns:
    tuple: A tuple containing the Chroma client and collection objects.
    """
    client = chromadb.PersistentClient("./posts_db")
    collection = client.get_or_create_collection(
        name=config_data['collection_name'],
        metadata={"hnsw:space": "cosine"},
    )

    ids = []
    documents = []
    metadatas = []
    embeddings = []
    # fetch_wordpress_data returns None on failure; treat that like "no posts"
    # instead of failing with a TypeError when iterating.
    for post in wordpress_data or []:
        cleaned_content = extract_text(post)
        ids.append(str(post['id']))
        documents.append(cleaned_content)
        embeddings.append(generate_embeddings(cleaned_content))
        metadatas.append({
            'title': post['title']['rendered'],
            'date': post['date'],
            'modified': post['modified'],
        })

    # An empty batch has nothing to store, so skip the upsert call entirely.
    if ids:
        collection.upsert(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings,
        )
    return client, collection