File size: 2,764 Bytes
78db47d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d008f91
 
 
 
 
 
78db47d
d008f91
 
 
 
 
 
 
 
 
5112962
d008f91
 
 
78db47d
 
65d4a2a
78db47d
 
 
 
65d4a2a
 
 
78db47d
65d4a2a
 
78db47d
65d4a2a
 
78db47d
 
 
 
 
 
b5d8a7a
 
 
 
8bb6871
d008f91
 
8bb6871
 
78db47d
 
 
 
 
8f9f226
8bb6871
78db47d
 
8f9f226
8bb6871
78db47d
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import gradio as gr
import requests
import os
import re
# NOTE(review): `re` appears unused in this file — confirm before removing.

# Hugging Face Inference API token, read from the environment so the secret
# is never committed. If unset, API_TOKEN is None and requests will be sent
# with "Bearer None" — the API will reject them with an auth error.
API_TOKEN = os.getenv('API_TOKEN')
# Hosted inference endpoint for the NASA-SMD/IBM sentence-transformer model.
API_URL = "https://api-inference.huggingface.co/models/nasa-impact/nasa-smd-ibm-st-v2"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

def query_similarity(source_sentence, sentences):
    """Query the HF sentence-similarity endpoint.

    Args:
        source_sentence: The query string to compare against.
        sentences: List of candidate strings to score.

    Returns:
        A ``(result, sentences)`` tuple, where ``result`` is the decoded
        JSON response (a list of float scores on success, or the API's
        error dict) and ``sentences`` is echoed back so the caller can
        pair scores with their text.
    """
    payload = {
        "inputs": {
            "source_sentence": source_sentence,
            "sentences": sentences
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)

    # Bug fix: the original caught `json.JSONDecodeError`, but `json` was
    # never imported, so a non-JSON body raised NameError instead of being
    # handled. `response.json()` raises requests' JSONDecodeError, which is
    # a ValueError subclass — catching ValueError needs no extra import.
    try:
        return response.json(), sentences
    except ValueError:
        return {"error": "Failed to decode JSON response"}, sentences

def format_output(response, sentences):
    """Render similarity results as human-readable text.

    Args:
        response: Expected to be a list of float scores, parallel to
            ``sentences``; anything else is reported as unexpected.
        sentences: The candidate strings that were scored.

    Returns:
        One formatted line per sentence, highest score first, joined by
        blank lines — or an error string for a non-list response.
    """
    # Guard clause: the API signals errors with a dict, not a list.
    if not isinstance(response, list):
        return f"Unexpected response format: {response}"

    # Pair each score with its sentence and rank best-first.
    ranked = sorted(zip(response, sentences), key=lambda pair: pair[0], reverse=True)
    formatted = [
        f"Sentence: {text.strip()}, Score: {value:.4f}\n"
        for value, text in ranked
    ]
    return "\n".join(formatted)

def split_into_chunks(text, chunk_size=100):
    """Group paragraphs into chunks of at most ``chunk_size`` words.

    Paragraphs (separated by blank lines) are accumulated in order; when
    adding the next paragraph would exceed the word budget, the current
    accumulation is flushed as one chunk. A single paragraph longer than
    ``chunk_size`` becomes its own oversized chunk rather than being split.

    Args:
        text: Full document text.
        chunk_size: Soft word limit per chunk.

    Returns:
        List of non-empty chunk strings.
    """
    paragraphs = text.split('\n\n')  # Split text into paragraphs
    chunks = []
    current_chunk = []
    current_length = 0

    for paragraph in paragraphs:
        paragraph_length = len(paragraph.split())
        if current_length + paragraph_length > chunk_size:
            # Bug fix: if the very first paragraph already exceeds
            # chunk_size, current_chunk is still empty and the original
            # appended "" — an empty chunk later sent to the API.
            # Only flush when there is accumulated content.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [paragraph]
            current_length = paragraph_length
        else:
            current_chunk.append(paragraph)
            current_length += paragraph_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def semantic_search(query, file_path):
    """Run the end-to-end search: read file, chunk, score, format.

    Args:
        query: User's search query.
        file_path: Path to the uploaded .txt file, or None if nothing
            was uploaded.

    Returns:
        Formatted ranking of chunks by similarity, or a prompt asking
        the user to upload a file.
    """
    # Guard clause: nothing uploaded yet.
    if file_path is None:
        return "Please upload a .txt file."

    with open(file_path, 'r', encoding='utf-8') as handle:
        document = handle.read()

    passages = split_into_chunks(document)
    api_response, scored_passages = query_similarity(query, passages)
    return format_output(api_response, scored_passages)

# Define Gradio interface
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.Textbox(lines=2, label="Input Query", placeholder="Enter your query here..."),
        gr.File(file_types=['txt'], label="Upload a .txt file")
    ],
    outputs="text",
    title="Semantic Search with Indus-ST (demo)",
    description="Input a query and upload a document (.txt) to find the most semantically similar paragraphs or sentences."
)

iface.launch()