Spaces:

GIZ
/

audit_assistant

Running on CPU Upgrade

ppsingh committed on Aug 7

Commit

1598ceb

•

1 Parent(s): bc697d2

Update auditqa/doc_process.py

Files changed (1) hide show

auditqa/doc_process.py CHANGED Viewed

@@ -31,7 +31,7 @@ def process_pdf():
     # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
     chunk_size = 256
     text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-            AutoTokenizer.from_pretrained("BAAI/bge-en-icl"),
             chunk_size=chunk_size,
             chunk_overlap=10,
             add_start_index=True,
@@ -78,7 +78,7 @@ def process_pdf():
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
         encode_kwargs = {'normalize_embeddings': True},
-        model_name="BAAI/bge-en-icl"
     )
     # placeholder for collection
     qdrant_collections = {}

     # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
     chunk_size = 256
     text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+            AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5"),
             chunk_size=chunk_size,
             chunk_overlap=10,
             add_start_index=True,
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
         encode_kwargs = {'normalize_embeddings': True},
+        model_name="BAAI/bge-large-en-v1.5"
     )
     # placeholder for collection
     qdrant_collections = {}