Spaces:
Runtime error
Runtime error
import requests | |
import re | |
from html import unescape | |
from sentence_transformers import SentenceTransformer | |
import chromadb | |
import yaml | |
# Load runtime settings from ./config.yaml once, at import time. Other code in
# this module reads config_data['embedding_model'] and
# config_data['collection_name'].
try:
    with open("./config.yaml", "r", encoding="utf-8") as file:
        config_data = yaml.safe_load(file)
except FileNotFoundError as e:
    # Keep the original message for the missing-file case, but chain the cause
    # so the traceback still shows what actually happened.
    raise Exception("Not able to find the file ./config.yaml") from e
except (OSError, yaml.YAMLError) as e:
    # The file exists but could not be read or parsed; do not misreport this
    # as "not found".
    raise Exception(f"Not able to load the file ./config.yaml: {e}") from e
# Function to fetch data from a WordPress site
def fetch_wordpress_data(site_url, timeout=30):
    """
    Fetches posts from a WordPress site using its REST API.

    Sends a single GET request to ``<site_url>/wp-json/wp/v2/posts`` and
    returns the decoded JSON body.

    Args:
        site_url (str): The base URL of the WordPress site. A trailing slash
            is tolerated.
        timeout (float | tuple): Timeout in seconds passed to
            ``requests.get``. Without one, the call can block indefinitely
            on an unresponsive server.

    Returns:
        The decoded JSON response, or None if the request failed or the
        body was not valid JSON. The error is printed, not raised.

    NOTE(review): only one request is made, so presumably only the first page
    of posts is returned -- confirm against the WordPress REST API pagination
    defaults if all posts are needed.
    """
    # Strip a trailing slash so "https://site/" does not become "https://site//wp-json/...".
    api_url = f"{site_url.rstrip('/')}/wp-json/wp/v2/posts"
    try:
        # Send GET request to the WordPress API
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # Raise exception for unsuccessful responses
        # Extract and return JSON data from the response
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # the decode error is not a RequestException subclass.
        print("Error fetching WordPress data:", e)
        return None
def preprocess_text(text):
    """
    Preprocesses text by removing HTML tags, decoding HTML entities, collapsing
    runs of newlines, and trimming leading/trailing whitespace.

    Args:
        text (str): The text to be preprocessed.

    Returns:
        str: The preprocessed text.
    """
    # Remove HTML tags. "[^>]*" (unlike ".*?") also matches newlines, so tags
    # whose attributes are wrapped across several lines are removed too.
    clean_text = re.sub(r'<[^>]*>', '', text)
    # Decode HTML entities. This runs after tag removal so that escaped markup
    # such as "&lt;b&gt;" is kept as literal text and not stripped as a tag.
    clean_text = unescape(clean_text)
    # Collapse consecutive newline characters into a single newline
    clean_text = re.sub(r'\n+', '\n', clean_text)
    # Trim leading and trailing whitespace (including newlines)
    return clean_text.strip()
# Loaded SentenceTransformer instances, keyed by model name, so the model is
# constructed once per process and not once per call.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for model_name, loading it on first use."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model


def generate_embeddings(text):
    """
    Generates embeddings for the input text using the pre-trained model named
    by config_data['embedding_model'].

    Args:
        text (str): The input text.

    Returns:
        list: The result of ``model.encode(text)`` converted with ``.tolist()``.
    """
    # Reuse the already-loaded model; constructing it is expensive and this
    # function is called once per post when building the vector store.
    model = _get_embedding_model(config_data['embedding_model'])
    # Generate embeddings for the input text
    embeddings = model.encode(text)
    return embeddings.tolist()
def extract_text(post):
    """
    Returns the cleaned text of a WordPress post.

    Reads the rendered body from ``post['content']['rendered']`` and passes it
    through ``preprocess_text``.

    Args:
        post (dict): The WordPress post data.

    Returns:
        str: The preprocessed text content of the post.
    """
    content = post['content']
    rendered_body = content['rendered']
    return preprocess_text(rendered_body)
def create_vector_store_and_add_posts(wordpress_data, db_path="./posts_db"):
    """
    Creates (or opens) a vector store in a Chroma database and upserts
    WordPress posts into it.

    Each post is stored under its ``id`` (as a string) with its cleaned text
    as the document, its embedding, and ``title``/``date``/``modified``
    metadata.

    Args:
        wordpress_data (list | None): List of WordPress post data. None or an
            empty list is accepted; the collection is then created/opened
            but nothing is upserted.
        db_path (str): Directory passed to ``chromadb.PersistentClient``.

    Returns:
        tuple: A tuple containing the Chroma client and collection objects.
    """
    client = chromadb.PersistentClient(db_path)
    collection = client.get_or_create_collection(
        name=config_data['collection_name'],
        metadata={"hnsw:space": "cosine"},
    )

    # fetch_wordpress_data() returns None on failure, and a site may have no
    # posts; skip the upsert in both cases instead of failing.
    if not wordpress_data:
        return client, collection

    ids = []
    documents = []
    metadatas = []
    embeddings = []
    for post in wordpress_data:
        cleaned_content = extract_text(post)
        ids.append(str(post['id']))
        documents.append(cleaned_content)
        embeddings.append(generate_embeddings(cleaned_content))
        metadatas.append({
            'title': post['title']['rendered'],
            'date': post['date'],
            'modified': post['modified'],
        })

    # upsert (not add) so re-running against the same database updates
    # existing posts by id.
    collection.upsert(ids=ids, documents=documents, metadatas=metadatas, embeddings=embeddings)
    return client, collection