Spaces:
Paused
Paused
File size: 2,667 Bytes
3ab64ac 16ff9c3 3ab64ac 2d17ff2 22b7264 16ff9c3 68cba5e 2d17ff2 68cba5e 3ab64ac 7f1736e 86a6762 3ab64ac 8edd3eb 1f81843 8edd3eb 9ca031b 8edd3eb 7ebdd15 8edd3eb 1593f40 7ebdd15 6240195 1593f40 24a7885 1593f40 68cba5e 16ff9c3 6240195 22b7264 1593f40 22b7264 1593f40 a6f29ed 24a7885 22b7264 e491ae1 4af9e36 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import glob
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer
from torch import cuda
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant
# Pick the torch device used for the embedding model: the GPU when CUDA is
# available, otherwise the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Disabled: reading an HF_TOKEN from a local .env file.
#from dotenv import load_dotenv
#load_dotenv()
#HF_token = os.environ["HF_TOKEN"]

# Directory that holds the source PDF reports.
path_to_data = "./data/"
def process_pdf() -> "dict[str, Qdrant]":
    """Load the report PDFs, chunk them, and index them in in-memory Qdrant.

    The three PDFs (MWTS2021, MWTS2022, Consolidated2021) are read from
    ``path_to_data``, split into chunks with a splitter built from the
    ``BAAI/bge-small-en-v1.5`` tokenizer, tagged with ``source`` and ``year``
    metadata, and grouped into the ``'Consolidated'`` and ``'MWTS'`` families.
    Each non-empty family is embedded with the same model and stored in its
    own in-memory Qdrant collection.

    Returns:
        A dict mapping the family name (``'Consolidated'`` / ``'MWTS'``) to its
        Qdrant vector store. A family for which no PDF could be loaded is
        reported on stdout and left out of the dict.
    """
    # One model name for both the tokenizer (chunk sizing) and the embedder,
    # so the two can never drift apart.
    model_name = "BAAI/bge-small-en-v1.5"

    files = {
        name: os.path.join(path_to_data, f"{name}.pdf")
        for name in ("MWTS2021", "MWTS2022", "Consolidated2021")
    }

    docs = {}
    for name, pdf_path in files.items():
        try:
            docs[name] = PyMuPDFLoader(pdf_path).load()
        except Exception as e:
            # Best effort: a missing or unreadable PDF is reported and
            # skipped so the remaining reports are still indexed.
            print("Exception: ", e)

    # text splitter based on the tokenizer of a model of your choosing
    # to make texts fit exactly a transformer's context window size
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(model_name),
        chunk_size=chunk_size,
        chunk_overlap=10,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    all_documents = {'Consolidated': [], 'MWTS': []}
    for name, pages in docs.items():
        chunks = text_splitter.split_documents(pages)
        for chunk in chunks:
            chunk.metadata["source"] = name
            # File keys end in the four-digit report year, e.g. "MWTS2021".
            chunk.metadata["year"] = name[-4:]

        # A file belongs to every family whose name occurs in its key.
        for key, bucket in all_documents.items():
            if key in name:
                print(key)
                bucket.extend(chunks)

    embeddings = HuggingFaceEmbeddings(
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
        model_name=model_name,
    )

    qdrant_collections = {}
    for key, chunks in all_documents.items():
        if not chunks:
            # NOTE(review): an empty document list is expected to make
            # Qdrant.from_documents fail (nothing to derive the vector size
            # from) - confirm; skipped here so one missing family does not
            # abort the others.
            print("no documents for:", key)
            continue
        print("emebddings for:", key)
        qdrant_collections[key] = Qdrant.from_documents(
            chunks,
            embeddings,
            location=":memory:",
            collection_name=key,
        )
    print("done")
    return qdrant_collections