import argparse
import json
import multiprocessing as mp
import shutil

import faiss
import numpy as np
import tiktoken
from beartype import beartype
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from zsvision.zs_multiproc import starmap_with_kwargs
from zsvision.zs_utils import BlockTimer

from llm_api_utils import init_openai_with_api_key, PRICE_PER_1K_TOKENS


class ClaimToEvidence:
    """Link extracted claims to supporting evidence chunks stored in a FAISS index."""

    def __init__(
        self,
        embedding_model="ada",
        limit=0,
        refresh=False,
        processes=1,
        num_chunks_per_worker=50,
        filter_str="",
        text_embedding_chunk_size=500,
        k_nearest_neighbours=3,
    ):
        self.embedding_model = embedding_model
        self.limit = limit
        self.refresh = refresh
        self.processes = processes
        self.num_chunks_per_worker = num_chunks_per_worker
        self.filter_str = filter_str
        self.text_embedding_chunk_size = text_embedding_chunk_size
        self.k_nearest_neighbours = k_nearest_neighbours

    @beartype
    def link_claims_to_evidence(
        self,
        metas: list,
        faiss_db: FAISS,
    ) -> list:
        # Build one retrieval query per claim, pairing the claim with the
        # verbatim quote it was derived from.
        queries = [
            f"Evidence for {x['claim']} (Based on {x['verbatim_quote']})" for x in metas
        ]

        # Estimate the cost of embedding all queries before issuing API calls.
        encoding = tiktoken.encoding_for_model(self.embedding_model)
        num_tokens = len(encoding.encode(" ".join(queries)))
        print(
            f"Step 6: Estimated cost: "
            f"{num_tokens * PRICE_PER_1K_TOKENS[self.embedding_model]['embed'] / 1000:.2f} USD"
        )

        # Cap k at the number of documents actually stored in the index.
        k_nearest_neighbours = min(
            len(faiss_db.index_to_docstore_id), self.k_nearest_neighbours
        )

        # Retrieve the k nearest chunks for each query and attach them to the
        # corresponding claim metadata.
        for text_query, meta in zip(queries, metas):
            docs_and_scores = faiss_db.similarity_search_with_relevance_scores(
                text_query, k=k_nearest_neighbours
            )

            evidences = []
            for document, score in docs_and_scores:
                evidence = {
                    "chunk_tag": document.metadata["chunk_tag"],
                    "link": document.metadata["link"],
                    "query": document.metadata["query"],
                    "date_accessed": document.metadata["date_accessed"],
                    "text": document.page_content,
                    "similarity_score": float(score),
                }
                evidences.append(evidence)

            meta["evidences"] = evidences
            meta["embedded_query_used_to_find_evidence"] = text_query

        print(f"Returning {len(metas)} queries with supporting evidence")
        return metas
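

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original pipeline): builds a
# toy FAISS index whose document metadata carries the keys that
# link_claims_to_evidence() reads (chunk_tag, link, query, date_accessed),
# then links a single hypothetical claim against it. Assumes OPENAI_API_KEY
# is set in the environment (the repo's init_openai_with_api_key() helper
# may be the intended way to do this); all texts, links and dates below are
# placeholder values.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    embeddings = OpenAIEmbeddings()
    texts = [
        "The Eiffel Tower is 330 metres tall.",
        "The Eiffel Tower was completed in 1889.",
    ]
    metadatas = [
        {
            "chunk_tag": f"chunk_{idx}",
            "link": "https://example.com/eiffel-tower",
            "query": "eiffel tower height",
            "date_accessed": "2024-01-01",
        }
        for idx in range(len(texts))
    ]
    toy_db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

    metas = [
        {
            "claim": "The Eiffel Tower is 330 metres tall",
            "verbatim_quote": "the 330 m tall Eiffel Tower",
        }
    ]
    linker = ClaimToEvidence(k_nearest_neighbours=2)
    linked = linker.link_claims_to_evidence(metas=metas, faiss_db=toy_db)
    print(json.dumps(linked, indent=2))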