import os
import re

import gradio as gr
import requests

# Hugging Face Inference API endpoint for the NASA SMD sentence-transformer model.
API_TOKEN = os.getenv("API_TOKEN")
API_URL = "https://api-inference.huggingface.co/models/nasa-impact/nasa-smd-ibm-st-v2"
headers = {"Authorization": f"Bearer {API_TOKEN}"}


def query_similarity(source_sentence, sentences):
    """Score each candidate sentence against the query via the sentence-similarity task."""
    payload = {
        "inputs": {
            "source_sentence": source_sentence,
            "sentences": sentences,
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


def format_output(chunks, scores):
    """Pair each chunk with its similarity score and list them highest first.

    The sentence-similarity task returns a flat list of floats, one per
    candidate sentence, in the same order as the `sentences` payload field,
    so the scores must be zipped back together with the chunks.
    """
    results = sorted(zip(chunks, scores), key=lambda pair: pair[1], reverse=True)
    return "\n".join(f"Sentence: {chunk}, Score: {score:.4f}" for chunk, score in results)


def split_into_chunks(text, chunk_size=100):
    """Split text into chunks of roughly chunk_size words, on sentence boundaries."""
    sentences = re.split(r"(?<=[.!?]) +", text)  # split after ., !, or ?
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        # Start a new chunk once the word budget is exceeded; the extra
        # current_chunk check avoids emitting an empty leading chunk when
        # the very first sentence is longer than chunk_size.
        if current_length + sentence_length > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def semantic_search(query, file):
    if file is None:
        return "Please upload a .txt file."
    # Recent Gradio versions pass gr.File values to the callback as a filepath
    # string; older versions pass a tempfile wrapper with a .name attribute.
    path = file if isinstance(file, str) else file.name
    with open(path, "r", encoding="utf-8") as f:
        document = f.read()
    chunks = split_into_chunks(document)
    scores = query_similarity(query, chunks)
    # The API reports failures (e.g. model still loading, bad token) as a
    # dict with an "error" key instead of a list of scores.
    if isinstance(scores, dict) and "error" in scores:
        return f"API error: {scores['error']}"
    return format_output(chunks, scores)


# Define the Gradio interface.
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your query here..."),
        gr.File(file_types=[".txt"], label="Upload a .txt file"),
    ],
    outputs="text",
    title="Document Semantic Search",
    description="Input a query and upload a document (.txt) to find the most semantically similar paragraphs or sentences.",
)

iface.launch()
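
# Quick sanity check (a sketch, separate from the app): with a valid token
# exported as API_TOKEN, query_similarity can be exercised directly. The
# sentence-similarity task returns a flat list of floats aligned with the
# input order; the scores shown here are illustrative, not real model output.
#
#   query_similarity(
#       "solar wind",
#       ["The solar wind streams outward from the Sun.", "Mars has two moons."],
#   )
#   -> [0.83, 0.12]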