File size: 2,764 Bytes
78db47d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d008f91
 
 
 
 
 
78db47d
d008f91
 
 
 
 
 
 
 
 
5112962
d008f91
 
 
78db47d
 
65d4a2a
78db47d
 
 
 
65d4a2a
 
 
78db47d
65d4a2a
 
78db47d
65d4a2a
 
78db47d
 
 
 
 
 
b5d8a7a
 
 
 
8bb6871
d008f91
 
8bb6871
 
78db47d
 
 
 
 
8f9f226
8bb6871
78db47d
 
8f9f226
8bb6871
78db47d
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import gradio as gr
import requests
import os
import re
# NOTE(review): `re` appears unused in this file — confirm before removing.

# Hugging Face Inference API token, read from the environment so the secret
# is never committed. If unset, API_TOKEN is None and requests will be sent
# with "Bearer None" — the API will reject them with an auth error.
API_TOKEN = os.getenv('API_TOKEN')
# Hosted inference endpoint for the NASA-SMD/IBM sentence-transformer model.
API_URL = "https://api-inference.huggingface.co/models/nasa-impact/nasa-smd-ibm-st-v2"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

def query_similarity(source_sentence, sentences):
    """Query the HF sentence-similarity endpoint.

    Args:
        source_sentence: The query string to compare against.
        sentences: List of candidate strings to score.

    Returns:
        A ``(result, sentences)`` tuple, where ``result`` is the decoded
        JSON response (a list of float scores on success, or the API's
        error dict) and ``sentences`` is echoed back so the caller can
        pair scores with their text.
    """
    payload = {
        "inputs": {
            "source_sentence": source_sentence,
            "sentences": sentences
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)

    # Bug fix: the original caught `json.JSONDecodeError`, but `json` was
    # never imported, so a non-JSON body raised NameError instead of being
    # handled. `response.json()` raises requests' JSONDecodeError, which is
    # a ValueError subclass — catching ValueError needs no extra import.
    try:
        return response.json(), sentences
    except ValueError:
        return {"error": "Failed to decode JSON response"}, sentences

def format_output(response, sentences):
    """Render similarity results as human-readable text.

    Args:
        response: Expected to be a list of float scores, parallel to
            ``sentences``; anything else is reported as unexpected.
        sentences: The candidate strings that were scored.

    Returns:
        One formatted line per sentence, highest score first, joined by
        blank lines — or an error string for a non-list response.
    """
    # Guard clause: the API signals errors with a dict, not a list.
    if not isinstance(response, list):
        return f"Unexpected response format: {response}"

    # Pair each score with its sentence and rank best-first.
    ranked = sorted(zip(response, sentences), key=lambda pair: pair[0], reverse=True)
    formatted = [
        f"Sentence: {text.strip()}, Score: {value:.4f}\n"
        for value, text in ranked
    ]
    return "\n".join(formatted)

def split_into_chunks(text, chunk_size=100):
    """Group paragraphs into chunks of at most ``chunk_size`` words.

    Paragraphs (separated by blank lines) are accumulated in order; when
    adding the next paragraph would exceed the word budget, the current
    accumulation is flushed as one chunk. A single paragraph longer than
    ``chunk_size`` becomes its own oversized chunk rather than being split.

    Args:
        text: Full document text.
        chunk_size: Soft word limit per chunk.

    Returns:
        List of non-empty chunk strings.
    """
    paragraphs = text.split('\n\n')  # Split text into paragraphs
    chunks = []
    current_chunk = []
    current_length = 0

    for paragraph in paragraphs:
        paragraph_length = len(paragraph.split())
        if current_length + paragraph_length > chunk_size:
            # Bug fix: if the very first paragraph already exceeds
            # chunk_size, current_chunk is still empty and the original
            # appended "" — an empty chunk later sent to the API.
            # Only flush when there is accumulated content.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [paragraph]
            current_length = paragraph_length
        else:
            current_chunk.append(paragraph)
            current_length += paragraph_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def semantic_search(query, file_path):
    """Run the end-to-end search: read file, chunk, score, format.

    Args:
        query: User's search query.
        file_path: Path to the uploaded .txt file, or None if nothing
            was uploaded.

    Returns:
        Formatted ranking of chunks by similarity, or a prompt asking
        the user to upload a file.
    """
    # Guard clause: nothing uploaded yet.
    if file_path is None:
        return "Please upload a .txt file."

    with open(file_path, 'r', encoding='utf-8') as handle:
        document = handle.read()

    passages = split_into_chunks(document)
    api_response, scored_passages = query_similarity(query, passages)
    return format_output(api_response, scored_passages)

# Define Gradio interface
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.Textbox(lines=2, label="Input Query", placeholder="Enter your query here..."),
        gr.File(file_types=['txt'], label="Upload a .txt file")
    ],
    outputs="text",
    title="Semantic Search with Indus-ST (demo)",
    description="Input a query and upload a document (.txt) to find the most semantically similar paragraphs or sentences."
)

iface.launch()