pdf2dataset

Sleeping

App Files Files Community

Quentin Gallouédec committed on Jun 13

Commit

58e4b18

•

1 Parent(s): 0b649de

app

Browse files

Files changed (1) hide show

app.py +115 -0

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+from pypdf import PdfReader
+import re
+import random
+import gradio as gr
+from datasets import Dataset, DatasetDict
+import os
+import pandas as pd
+# Substrings that clean() deletes outright from the extracted text.
+# NOTE(review): some entries display as empty strings or as the Unicode
+# replacement character; they may be private-use glyphs that did not survive
+# copy/paste -- confirm against the original file before editing this list.
+to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
+# Substring -> replacement pairs that clean() applies in insertion order,
+# after the removals above.
+to_be_replaced = {
+    "½": "1/2",
+    # Typographic dashes, quotes and ellipsis -> plain ASCII equivalents.
+    "–": "-",
+    "‘": "'",
+    "’": "'",
+    "…": "...",
+    "₋": "-",
+    "−": "-",
+    # Circled numerals 11-14 -> "N." markers.
+    "⓫": "11.",
+    "⓬": "12.",
+    "⓭": "13.",
+    "⓮": "14.",
+    "◦": "°",
+    # Circled numerals 1-10 -> "N." markers.
+    "❶": "1.",
+    "❷": "2.",
+    "❸": "3.",
+    "❹": "4.",
+    "❺": "5.",
+    "❻": "6.",
+    "❼": "7.",
+    "❽": "8.",
+    "❾": "9.",
+    "❿": "10.",
+    # Newlines become spaces, so each cleaned text is a single line.
+    "\n": " ",
+}
+def clean(text):
+    # Remove all the unwanted characters
+    for char in to_be_removed:
+        text = text.replace(char, "")
+    # Replace all the characters that need to be replaced
+    for char, replacement in to_be_replaced.items():
+        text = text.replace(char, replacement)
+    # For all \n, if the next line doesn't start with a capital letter, remove the \n
+    # text = re.sub(r"\n([^A-ZÀ-ÖØ-Þ])", r" \1", text)
+    # Make sure that every "." is followed by a space
+    text = re.sub(r"\.([^ ])", r". \1", text)
+    # Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
+    text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)
+    # Make sure that there is no space before a comma and a period
+    text = text.replace(" ,", ",")
+    text = text.replace(" .", ".")
+    text = text.replace(" -", "-")
+    text = text.replace("- ", "-")
+    while "  " in text:
+        text = text.replace("  ", " ")
+    return text
+def pdf2dataset(file, _, progress=gr.Progress()):
+    progress(0, desc="Starting...")
+    reader = PdfReader(file)
+    num_pages = len(reader.pages)
+    dataset_name = f"{random.getrandbits(128):x}"
+    page_texts = []
+    for page in progress.tqdm(reader.pages, total=num_pages, desc="Converting pages"):
+        page_text = page.extract_text()
+        page_text = clean(page_text)
+        page_texts.append(page_text)
+    progress(0, desc="Uploading to Hugging Face...")
+    dataset = Dataset.from_dict({"text": page_texts})
+    dataset.push_to_hub(f"pdf2dataset/{dataset_name}", token=os.getenv("TOKEN"))
+    progress(1, desc="Done!")
+    instrctions = f"""
+Your dataset is now available on Hugging Face Datasets at [pdf2dataset/{dataset_name}](https://huggingface.co/datasets/pdf2dataset/{dataset_name}).
+You can load the dataset using the following code:
+```python
+from datasets import load_dataset
+dataset = load_dataset("pdf2dataset/{dataset_name}")
+```
+    """
+    preview = dataset["text"][:10]
+    preview = pd.DataFrame(preview, columns=["text"])
+    return instrctions, preview
+demo = gr.Interface(
+    title="PDF to 🤗 Dataset",
+    fn=pdf2dataset,
+    inputs=[
+        gr.File(file_types=["pdf"]),
+        gr.Markdown(
+            "⚠️ Caution: This process will upload your data to a public Hugging Face repository. Do not upload sensitive information."
+        ),
+    ],
+    outputs=[gr.Markdown(), gr.Dataframe(label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True)],
+    submit_btn="Convert to dataset",
+    allow_flagging="never",
+)
+demo.launch()