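"""Scrape web articles, optionally summarize them with a local or hosted LLM API, and ingest
the results into the application database.

Exposes helpers for batch scraping, single-article scrape-and-summarize, scrape-only ingestion,
and ingestion of unstructured text.
"""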
import json
import logging
import os
from datetime import datetime

import gradio as gr
import requests
from tqdm import tqdm

from App_Function_Libraries.Utils.Utils import sanitize_filename
from Article_Extractor_Lib import scrape_article
from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, \
    summarize_with_tabbyapi, summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, \
    summarize_with_cohere, summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, \
    summarize_with_huggingface, summarize_with_mistral
from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db


def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles,
                                  system_message=None):
    """Scrape multiple newline-separated URLs and return the successfully extracted articles.

    Optional custom titles (one per line, matched by position) override the scraped titles.
    Failures are collected and logged; failed URLs are skipped rather than aborting the batch.
    """
    urls = [url.strip() for url in urls.split('\n') if url.strip()]
    custom_titles = custom_article_titles.split('\n') if custom_article_titles else []

    results = []
    errors = []

    # Gradio progress tracker for the UI; tqdm reports progress on the console.
    progress = gr.Progress()

    for i, url in tqdm(enumerate(urls), total=len(urls), desc="Processing URLs"):
        custom_title = custom_titles[i].strip() if i < len(custom_titles) else None
        try:
            article = scrape_article(url)
            if article and article['extraction_successful']:
                if custom_title:
                    article['title'] = custom_title
                results.append(article)
        except Exception as e:
            error_message = f"Error processing URL {i + 1} ({url}): {str(e)}"
            errors.append(error_message)

        progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs")

    if errors:
        logging.error("\n".join(errors))

    return results


def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
    """Scrape a single article, optionally summarize it with the selected API, and ingest it into the database.

    Returns a human-readable status string with the title, author, ingestion result, summary,
    and article contents, or an error message on failure.
    """
    try:
        article_data = scrape_article(url)
        print(f"Scraped Article Data: {article_data}")
        if not article_data:
            return "Failed to scrape the article."

        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
        author = article_data.get('author', 'Unknown')
        content = article_data.get('content', '')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")

        system_message = system_message or "Act as a professional summarizer and summarize this article."
        article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article."

        summary = None
        if api_name:
            logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}")

            # Write the article out as a single-segment JSON file, as the summarizers expect.
            os.makedirs("Results", exist_ok=True)
            sanitized_title = sanitize_filename(title)
            json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json")

            with open(json_file_path, 'w') as json_file:
                json.dump([{'text': content}], json_file, indent=2)

            try:
                if api_name.lower() == 'openai':
                    summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "anthropic":
                    summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "cohere":
                    summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "groq":
                    logging.debug("MAIN: Trying to summarize with groq")
                    summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "openrouter":
                    logging.debug("MAIN: Trying to summarize with OpenRouter")
                    summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "deepseek":
                    logging.debug("MAIN: Trying to summarize with DeepSeek")
                    summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "mistral":
                    summary = summarize_with_mistral(api_key, json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "llama.cpp":
                    logging.debug("MAIN: Trying to summarize with Llama.cpp")
                    summary = summarize_with_llama(json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "kobold":
                    logging.debug("MAIN: Trying to summarize with Kobold.cpp")
                    summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt, system_message)
                elif api_name.lower() == "ooba":
                    summary = summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt, system_message)
                elif api_name.lower() == "tabbyapi":
                    summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "vllm":
                    logging.debug("MAIN: Trying to summarize with VLLM")
                    summary = summarize_with_vllm(json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "local-llm":
                    logging.debug("MAIN: Trying to summarize with Local LLM")
                    summary = summarize_with_local_llm(json_file_path, article_custom_prompt, system_message)
                elif api_name.lower() == "huggingface":
                    logging.debug("MAIN: Trying to summarize with huggingface")
                    summary = summarize_with_huggingface(api_key, json_file_path, article_custom_prompt, system_message)
            except requests.exceptions.ConnectionError as e:
                logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")

            if summary:
                logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
                save_summary_to_file(summary, json_file_path)
            else:
                summary = "Summary not available"
                logging.warning(f"Failed to generate summary using {api_name} API")

        else:
            summary = "Article Summarization: No API provided for summarization."

        print(f"Summary: {summary}")

        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
                                                article_custom_prompt)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"


def scrape_and_no_summarize_then_ingest(url, keywords, custom_article_title):
    """Scrape a single article and ingest it into the database without summarizing it."""
    try:
        article_data = scrape_article(url)
        print(f"Scraped Article Data: {article_data}")
        if not article_data:
            return "Failed to scrape the article."

        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
        author = article_data.get('author', 'Unknown')
        content = article_data.get('content', '')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")

        # No summary and no custom prompt are stored for an un-summarized ingest.
        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, None, ingestion_date, None)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"


def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title,
                             system_message=None):
    """Ingest a block of unstructured text into the database, optionally summarizing it first.

    Only the OpenAI API is currently wired up for summarization here; any other API name
    records "Unsupported API." as the summary.
    """
    title = custom_article_title.strip() if custom_article_title else "Unstructured Text"
    author = "Unknown"
    ingestion_date = datetime.now().strftime('%Y-%m-%d')

    if api_name:
        # Write the text out as a single-segment JSON file, as the summarizers expect.
        os.makedirs("Results", exist_ok=True)
        json_file_path = os.path.join("Results", f"{sanitize_filename(title)}_segments.json")
        with open(json_file_path, 'w') as json_file:
            json.dump([{'text': text}], json_file, indent=2)

        if api_name.lower() == 'openai':
            summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message)
        else:
            summary = "Unsupported API."
    else:
        summary = "No API provided for summarization."

    ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary,
                                            ingestion_date, custom_prompt)
    return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
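

# Minimal usage sketch (illustration only, not part of the library): the URL, keywords, and the
# OPENAI_API_KEY environment variable below are placeholder assumptions; adjust them to your setup.
if __name__ == "__main__":
    example_result = scrape_and_summarize(
        url="https://example.com/some-article",        # placeholder URL
        custom_prompt_arg=None,                        # fall back to the default summarization prompt
        api_name="openai",                             # any API name handled above
        api_key=os.environ.get("OPENAI_API_KEY", ""),  # assumed env var for the example
        keywords="example,demo",
        custom_article_title=None,
    )
    print(example_result)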