import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import torch

# Summarization model: BART fine-tuned on CNN/DailyMail.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Sentence-embedding model used to score retrieval relevance.
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def evaluate_retrieval(query, retrieved_docs):
    """
    Evaluate the relevance of retrieved documents using cosine similarity
    between sentence embeddings of the query and each document.
    """
    query_embedding = similarity_model.encode(query, convert_to_tensor=True)
    doc_embeddings = similarity_model.encode(retrieved_docs, convert_to_tensor=True)

    # Cosine similarity between the query and each retrieved document.
    similarities = [util.pytorch_cos_sim(query_embedding, doc_embedding).item()
                    for doc_embedding in doc_embeddings]

    # Label each document against a fixed relevance threshold.
    relevance_threshold = 0.5
    relevance_scores = ['Correct' if sim > relevance_threshold else 'Incorrect' for sim in similarities]

    return relevance_scores


def decompose_then_recompose(retrieved_docs):
    """
    Refine the retrieved documents by summarizing their key information.
    """
    refined_knowledge = []
    for doc in retrieved_docs:
        # Condense each document into a short summary of its key points.
        summary = summarizer(doc, max_length=50, min_length=20, do_sample=False)[0]['summary_text']
        refined_knowledge.append(summary)
    return refined_knowledge


def web_search(query):
    """
    Perform a web search to retrieve additional external knowledge when the
    retrieved documents are not relevant.
    """
    search_url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect outbound links from the results page. (Scraping search results
    # directly is brittle; a search API is preferable in practice.)
    links = []
    for item in soup.find_all('a'):
        link = item.get('href')
        if link and "http" in link:
            links.append(link)
    return links[:5]


def generate_final_output(query, refined_knowledge):
    """
    Generate the final output summary using the refined knowledge.
    """
    combined_knowledge = " ".join(refined_knowledge)
    final_summary = summarizer(combined_knowledge, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
    return final_summary


def crag_workflow(query, retrieved_docs):
    """
    Full CRAG workflow integrating retrieval evaluation, knowledge refinement,
    and web search to generate a robust output summary.
    """
    # Step 1: Score each retrieved document against the query.
    relevance_scores = evaluate_retrieval(query, retrieved_docs)

    if 'Correct' in relevance_scores:
        # Step 2a: Refine only the documents judged relevant.
        refined_knowledge = decompose_then_recompose(
            [doc for doc, score in zip(retrieved_docs, relevance_scores) if score == 'Correct'])
    else:
        # Step 2b: No document was judged relevant, so fall back to a web search.
        # (The returned links are summarized directly here; fetching each page's
        # text first would give the summarizer real content to work with.)
        web_results = web_search(query)
        refined_knowledge = decompose_then_recompose(web_results)

    # Step 3: Recompose the refined knowledge into the final summary.
    final_summary = generate_final_output(query, refined_knowledge)

    return final_summary


if __name__ == "__main__":
    query = "What are the latest advancements in renewable energy?"
    retrieved_docs = [
        "Renewable energy is becoming increasingly important in today's world...",
        "Solar energy has seen significant advancements in the past decade...",
        "Wind energy technology is rapidly evolving, with new innovations expected soon..."
    ]

    final_summary = crag_workflow(query, retrieved_docs)
    print("Final Summary:", final_summary)