document-parser-rag

Running on Zero

App Files Files Community

Liam Dyer committed on May 30

Commit

83f2c7b

•

1 Parent(s): 15d68b8

letting it rip bud

Browse files

Files changed (2) hide show

app.py +102 -23
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -7,6 +7,34 @@ import string
 import random
 from pypdf import PdfReader
 import ocrmypdf
 def random_word(length):
@@ -14,9 +42,8 @@ def random_word(length):
     return "".join(random.choice(letters) for _ in range(length))
-def convert_pdf(input_file):
     reader = PdfReader(input_file)
-    metadata = extract_metadata_from_pdf(reader)
     text = extract_text_from_pdf(reader)
     # Check if there are any images
@@ -35,7 +62,7 @@ def convert_pdf(input_file):
         # Delete the OCR file
         os.remove(out_pdf_file)
-    return text, metadata
 def extract_text_from_pdf(reader):
@@ -48,17 +75,7 @@ def extract_text_from_pdf(reader):
     return full_text.strip()
-def extract_metadata_from_pdf(reader):
-    return {
-        "author": reader.metadata.author,
-        "creator": reader.metadata.creator,
-        "producer": reader.metadata.producer,
-        "subject": reader.metadata.subject,
-        "title": reader.metadata.title,
-    }
-def convert_pandoc(input_file, filename):
     # Temporarily copy the file
     shutil.copyfile(input_file, filename)
@@ -78,7 +95,7 @@ def convert_pandoc(input_file, filename):
 @spaces.GPU
-def convert(input_file, filename):
     plain_text_filetypes = [
         ".txt",
         ".csv",
@@ -91,23 +108,85 @@ def convert(input_file, filename):
         ".jsonc",
     ]
     # Already a plain text file that wouldn't benefit from pandoc so return the content
-    if any(filename.endswith(ft) for ft in plain_text_filetypes):
         with open(input_file, "r") as f:
-            return f.read(), {}
-    if filename.endswith(".pdf"):
         return convert_pdf(input_file)
-    return convert_pandoc(input_file, filename), {}
 # We accept a filename because the gradio JS interface removes this information
 # and it's critical for choosing the correct processing pipeline
 gr.Interface(
     convert,
-    inputs=[gr.File(label="Upload File", type="filepath"), gr.Text(label="Filename")],
-    outputs=[
-        gr.Text(label="Markdown"),
-        gr.JSON(label="Metadata"),
     ],
 ).launch()

 import random
 from pypdf import PdfReader
 import ocrmypdf
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m")
+model.to(device="cuda")
def chunk(text: str, max_length: int = 512) -> list[str]:
    """Split *text* into consecutive pieces of at most *max_length* characters.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per piece; must be positive.

    Returns:
        The pieces in order. Every piece except possibly the last has exactly
        ``max_length`` characters. An empty *text* yields ``[""]`` so callers
        always receive at least one chunk.

    Raises:
        ValueError: If *max_length* is not positive. A non-positive size can
            never consume the text, so the split would not terminate.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if not text:
        return [""]
    return [
        text[start:start + max_length]
        for start in range(0, len(text), max_length)
    ]
@spaces.GPU
def embed(queries, chunks) -> dict[str, list[tuple[int, float]]]:
    """Score every chunk against every query using the module-level model.

    Args:
        queries: Query strings; encoded with the model's ``"query"`` prompt.
        chunks: Text chunks to score against each query.

    Returns:
        A mapping ``{query: [(chunk_idx, score), ...]}`` where ``chunk_idx``
        is the position of the chunk in *chunks* and ``score`` is the dot
        product of the query embedding and that chunk's embedding. Duplicate
        query strings collapse into a single key.
    """
    query_embeddings = model.encode(queries, prompt_name="query")
    document_embeddings = model.encode(chunks)
    # One row of scores per query, one column per chunk
    # (assumes encode() returns 2-D arrays - TODO confirm).
    scores = query_embeddings @ document_embeddings.T
    # tolist() turns the array scalars into plain Python floats, matching the
    # annotated return type.
    return {
        query: list(enumerate(query_scores.tolist()))
        for query, query_scores in zip(queries, scores)
    }
 def random_word(length):
     return "".join(random.choice(letters) for _ in range(length))
+def convert_pdf(input_file) -> str:
     reader = PdfReader(input_file)
     text = extract_text_from_pdf(reader)
     # Check if there are any images
         # Delete the OCR file
         os.remove(out_pdf_file)
+    return text
 def extract_text_from_pdf(reader):
     return full_text.strip()
+def convert_pandoc(input_file, filename) -> str:
     # Temporarily copy the file
     shutil.copyfile(input_file, filename)
 @spaces.GPU
+def convert(input_file) -> str:
     plain_text_filetypes = [
         ".txt",
         ".csv",
         ".jsonc",
     ]
     # Already a plain text file that wouldn't benefit from pandoc so return the content
+    if any(input_file.endswith(ft) for ft in plain_text_filetypes):
         with open(input_file, "r") as f:
+            return f.read()
+    if input_file.endswith(".pdf"):
         return convert_pdf(input_file)
+    return convert_pandoc(input_file, input_file)
@spaces.GPU
def predict(queries, documents, max_characters) -> list[list[str]]:
    """Select the most query-relevant text from *documents* within a budget.

    Args:
        queries: Newline-separated query strings. Blank lines are ignored.
        documents: Paths of the uploaded files (``None`` is treated as no
            files).
        max_characters: Upper bound on the total number of characters
            returned across all documents.

    Returns:
        One list per input document. When the converted documents together
        are shorter than *max_characters*, each list holds the full document
        text. Otherwise each list holds the text of the chunks selected from
        that document, in the order they were selected.

    Raises:
        ValueError: If the documents exceed the budget and no non-blank query
            was supplied, because there is nothing to rank the chunks by.
    """
    query_list = [line.strip() for line in queries.splitlines() if line.strip()]

    # Convert the documents to text
    converted_docs = [convert(doc) for doc in (documents or [])]

    # Everything fits: return the documents whole
    total_doc_length = sum(len(doc) for doc in converted_docs)
    if total_doc_length < max_characters:
        return [[doc] for doc in converted_docs]

    if not query_list:
        raise ValueError(
            "At least one query is required to rank chunks when the "
            "documents exceed the character budget"
        )

    # Embed the documents in 512 character chunks
    chunked_docs = [chunk(doc, 512) for doc in converted_docs]
    embedded_docs = [embed(query_list, chunks) for chunks in chunked_docs]

    # Build {query: [(doc_idx, chunk_idx, score), ...]} across all documents
    scores_by_query = {}
    for doc_idx, embedded_doc in enumerate(embedded_docs):
        for query, doc_scores in embedded_doc.items():
            scores_by_query.setdefault(query, []).extend(
                (doc_idx, chunk_idx, score) for chunk_idx, score in doc_scores
            )

    # One best-first candidate stream per query
    active = [
        iter(sorted(candidates, key=lambda item: item[2], reverse=True))
        for candidates in scores_by_query.values()
    ]

    # Round-robin over the queries, taking each query's next best chunk,
    # until the budget is reached or every stream is exhausted. Dropping
    # exhausted streams guarantees the loop terminates.
    selected = [[] for _ in converted_docs]
    seen = set()
    total_chars = 0
    while total_chars < max_characters and active:
        still_active = []
        for candidates in active:
            top = next(candidates, None)
            if top is None:
                continue
            still_active.append(candidates)
            doc_idx, chunk_idx, _ = top
            chunk_text = chunked_docs[doc_idx][chunk_idx]
            # Ensure we have space
            if total_chars + len(chunk_text) > max_characters:
                continue
            # Ensure this chunk was not already picked by another query
            if (doc_idx, chunk_idx) in seen:
                continue
            seen.add((doc_idx, chunk_idx))
            selected[doc_idx].append(chunk_text)
            total_chars += len(chunk_text)
        active = still_active

    return selected
 # The handler must be `predict`: its parameters (queries, documents,
 # max_characters) line up one-to-one with the three inputs declared below.
 gr.Interface(
     predict,
     inputs=[
         gr.Textbox(label="Queries separated by newline"),
         gr.Files(label="Upload File"),
         gr.Number(label="Max output characters", value=16384),
     ],
     outputs=[gr.JSON(label="Embedded documents")],
 ).launch()

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 ocrmypdf==16.3.1
 pypdf==4.2.0

 ocrmypdf==16.3.1
 pypdf==4.2.0
+sentence-transformers==3.0.0