Nathan Slaughter committed
Commit 4d17caa
1 Parent(s): b8a0d78

add pytorch manual method
Files changed:
- .github/workflows/python-app.yaml +29 -0
- .gitignore +1 -0
- app.py +8 -0
- app/__init__.py +0 -0
- app/interface.py +113 -0
- app/models.py +31 -0
- app/processing.py +95 -0
- environment.yml +19 -0
- pytest.ini +5 -0
- requirements.txt +7 -0
- tests/__init__.py +0 -0
- tests/conftest.py +14 -0
- tests/test_models.py +20 -0
- tests/test_processing.py +73 -0
.github/workflows/python-app.yaml
ADDED
@@ -0,0 +1,29 @@
+# .github/workflows/python-app.yml
+
+name: Python application
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install pytest pytest-mock
+    - name: Run tests
+      run: |
+        pytest
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__
app.py
ADDED
@@ -0,0 +1,8 @@
+from app.interface import create_interface
+
+def main():
+    interface = create_interface()
+    interface.launch()
+
+if __name__ == "__main__":
+    main()
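Generation with a 7B model can take a while per request; a minimal variant of main() that serializes requests through Gradio's built-in queue (a sketch under that assumption, not part of this commit; Blocks.queue() is standard Gradio API):

def main():
    interface = create_interface()
    interface.queue()  # serialize long-running LLM calls instead of handling them concurrently
    interface.launch()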
app/__init__.py
ADDED
File without changes
app/interface.py
ADDED
@@ -0,0 +1,113 @@
+import gradio as gr
+from .models import LanguageModel
+from .processing import process_file, process_text_input
+
+def create_interface():
+    # Initialize the language model
+    language_model = LanguageModel()
+
+    # Define the Output Format Selector
+    output_format_selector = gr.Radio(
+        choices=["CSV", "JSON"],
+        label="Select Output Format",
+        value="JSON",
+        type="value"
+    )
+
+    # Define the Output Flashcards
+    flashcard_output_file = gr.Textbox(
+        label="Flashcards",
+        lines=20,
+        placeholder="Extracted flashcards will appear here..."
+    )
+    flashcard_output_text = gr.Textbox(
+        label="Flashcards",
+        lines=20,
+        placeholder="Extracted flashcards will appear here..."
+    )
+
+    # Define the Gradio interface function for File Upload
+    def handle_file_upload(file_obj, output_format):
+        try:
+            flashcards = process_file(file_obj, output_format, language_model)
+            return flashcards
+        except ValueError as ve:
+            return str(ve)
+
+    # Define the Gradio interface function for Text Input
+    def handle_text_input(input_text, output_format):
+        try:
+            flashcards = process_text_input(input_text, output_format, language_model)
+            return flashcards
+        except ValueError as ve:
+            return str(ve)
+
+    # Create the Gradio Tabs
+    with gr.Blocks() as interface:
+        gr.Markdown("# Flashcard Extraction Tool")
+        gr.Markdown(
+            "Extract flashcards from uploaded files or directly input text. Choose your preferred output format."
+        )
+        with gr.Tab("Upload File"):
+            with gr.Row():
+                with gr.Column():
+                    file_input = gr.File(
+                        label="Upload a File",
+                        file_types=['.pdf', '.txt', '.md']
+                    )
+                    format_selector = gr.Radio(
+                        choices=["CSV", "JSON"],
+                        label="Select Output Format",
+                        value="JSON",
+                        type="value"
+                    )
+                    submit_file = gr.Button("Extract Flashcards")
+                with gr.Column():
+                    flashcard_output_file = gr.Textbox(
+                        label="Flashcards",
+                        lines=20,
+                        placeholder="Extracted flashcards will appear here..."
+                    )
+            submit_file.click(
+                fn=handle_file_upload,
+                inputs=[file_input, format_selector],
+                outputs=flashcard_output_file
+            )
+
+        with gr.Tab("Input Text"):
+            with gr.Row():
+                with gr.Column():
+                    text_input = gr.Textbox(
+                        label="Enter Text",
+                        lines=20,
+                        placeholder="Type or paste your text here..."
+                    )
+                    format_selector_text = gr.Radio(
+                        choices=["CSV", "JSON"],
+                        label="Select Output Format",
+                        value="JSON",
+                        type="value"
+                    )
+                    submit_text = gr.Button("Extract Flashcards")
+                with gr.Column():
+                    flashcard_output_text = gr.Textbox(
+                        label="Flashcards",
+                        lines=20,
+                        placeholder="Extracted flashcards will appear here..."
+                    )
+            submit_text.click(
+                fn=handle_text_input,
+                inputs=[text_input, format_selector_text],
+                outputs=flashcard_output_text
+            )
+
+        gr.Markdown(
+            """
+            ---
+            **Notes:**
+            - Supported file types: `.pdf`, `.txt`, `.md`.
+            - Ensure that the input text is clear and well-structured for optimal flashcard extraction.
+            """
+        )
+
+    return interface
app/models.py
ADDED
@@ -0,0 +1,31 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+class LanguageModel:
+    def __init__(self, model_name: str = "Qwen/Qwen2.5-7B-Instruct"):
+        self.device = self._determine_device()
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    def _determine_device(self):
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            return torch.device("cpu")
+
+    def generate_flashcards(self, prompt: str, max_new_tokens: int = 1024) -> str:
+        inputs = self.tokenizer(prompt, return_tensors='pt').to(self.model.device)
+        with torch.no_grad():
+            output_ids = self.model.generate(
+                inputs.input_ids,
+                max_new_tokens=max_new_tokens,
+                do_sample=True
+            )
+        response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)  # decode only the new tokens so the prompt is not echoed back
+        return response
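The TODO in app/processing.py points to Qwen's chat docs; a hedged sketch of what a chat-template variant of generate_flashcards might look like (the method name and message layout are assumptions, not part of this commit; apply_chat_template is standard transformers API):

def generate_flashcards_chat(self, prompt: str, max_new_tokens: int = 1024) -> str:
    # Hypothetical variant: Qwen2.5-Instruct ships a chat template, so wrap the
    # prompt in a messages list instead of feeding it as raw text
    messages = [{"role": "user", "content": prompt}]
    input_ids = self.tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(self.model.device)
    with torch.no_grad():
        output_ids = self.model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
        )
    # Decode only the completion, not the templated prompt
    return self.tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)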
app/processing.py
ADDED
@@ -0,0 +1,95 @@
+import os
+import pymupdf4llm
+
+def process_pdf(pdf_path: str) -> str:
+    """
+    Extracts text from a PDF file using pymupdf4llm.
+    """
+    try:
+        text = pymupdf4llm.to_markdown(pdf_path)  # pymupdf4llm's extraction entry point; returns Markdown text
+        return text
+    except Exception as e:
+        raise ValueError(f"Error processing PDF: {str(e)}")
+
+def read_text_file(file_path: str) -> str:
+    """
+    Reads text from a .txt or .md file.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            text = f.read()
+        return text
+    except Exception as e:
+        raise ValueError(f"Error reading text file: {str(e)}")
+
+def format_prompt(output_format: str) -> str:
+    """
+    Formats the prompt based on the output type.
+    """
+    if output_format.lower() == "json":
+        return """You only respond with cards in JSON format. Follow the example below.
+
+EXAMPLE:
+[
+  {"question": "What is AI?", "answer": "Artificial Intelligence."},
+  {"question": "What is ML?", "answer": "Machine Learning."}
+  ...
+]
+"""
+    elif output_format.lower() == "csv":
+        return """You only respond with cards in CSV format. Follow the example below.
+
+EXAMPLE:
+"What is AI?", "Artificial Intelligence."
+"What is ML?", "Machine Learning."
+...
+"""
+
+def extract_flashcards(text: str, output_format: str, language_model) -> str:
+    """
+    Extracts flashcards from the input text using the LLM and formats them in CSV or JSON.
+    """
+    prompt = f"""You are an expert flashcard creator. You always include a single knowledge item per flashcard.
+
+{format_prompt(output_format)}
+
+
+Extract flashcards from the user's text:
+
+{text}
+
+Do not include the prompt or any other unnecessary information in the flashcards.
+Do not include triple ticks (```) or any other code blocks in the flashcards.
+"""
+    # TODO:
+    # see https://qwen.readthedocs.io/en/latest/inference/chat.html
+    # e.g. pipeline = pipeline("text-generation", model="Qwen/Qwen2.5-7B-Instruct")
+    response = language_model.generate_flashcards(prompt)
+    return response
+
+def process_file(file_obj, output_format: str, language_model) -> str:
+    """
+    Processes the uploaded file based on its type and extracts flashcards.
+    """
+    file_path = file_obj.name
+    file_ext = os.path.splitext(file_path)[1].lower()
+
+    if file_ext == '.pdf':
+        text = process_pdf(file_path)
+    elif file_ext in ['.txt', '.md']:
+        text = read_text_file(file_path)
+    else:
+        raise ValueError("Unsupported file type.")
+
+    flashcards = extract_flashcards(text, output_format, language_model)
+    return flashcards
+
+def process_text_input(input_text: str, output_format: str, language_model) -> str:
+    """
+    Processes the input text and extracts flashcards.
+    """
+    if not input_text.strip():
+        raise ValueError("No text provided.")
+
+    flashcards = extract_flashcards(input_text, output_format, language_model)
+    return flashcards
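extract_flashcards returns the model's raw string; when the JSON format is selected, callers could validate it before display. A minimal sketch (parse_json_flashcards is a hypothetical helper, not part of this commit):

import json

def parse_json_flashcards(response: str) -> list:
    # Validate that the reply is the bare JSON list the prompt asks for
    cards = json.loads(response)
    if not isinstance(cards, list):
        raise ValueError("Expected a JSON list of flashcards.")
    return cards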
environment.yml
ADDED
@@ -0,0 +1,19 @@
+name: flashcard-maker
+channels:
+  - conda-forge
+  - pytorch
+  - defaults
+dependencies:
+  - python=3.12
+  - pytorch  # the conda package is "pytorch", not "torch"
+  - torchvision
+  - torchaudio
+  - cudatoolkit=11.7  # Remove or adjust if installing CPU-only
+  - transformers
+  - gradio
+  - librosa
+  - pytest
+  - pytest-mock
+  - pip
+  - pip:
+      - pymupdf4llm
pytest.ini
ADDED
@@ -0,0 +1,5 @@
+# pytest.ini
+
+[pytest]
+filterwarnings =
+    ignore::DeprecationWarning
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+torch  # the pip package is "torch"; "pytorch" is not the PyTorch distribution on PyPI
+transformers
+gradio
+librosa
+pymupdf4llm
+pytest
+pytest-mock  # Added for mocking capabilities
tests/__init__.py
ADDED
File without changes
tests/conftest.py
ADDED
@@ -0,0 +1,14 @@
+import pytest
+from unittest.mock import Mock
+from app.models import LanguageModel
+
+@pytest.fixture
+def language_model():
+    """
+    Fixture to provide a mocked LanguageModel instance.
+    """
+    # Create a mock instance of LanguageModel
+    lm = Mock(spec=LanguageModel)
+    # Mock the generate_flashcards method
+    lm.generate_flashcards.return_value = '{"flashcards": []}'
+    return lm
tests/test_models.py
ADDED
@@ -0,0 +1,20 @@
+# tests/test_models.py
+
+import pytest
+
+def test_generate_flashcards(language_model, mocker):
+    """
+    Test the generate_flashcards method of LanguageModel.
+    """
+    prompt = "Sample prompt for flashcard generation."
+    expected_response = '{"flashcards": [{"Question": "What is AI?", "Answer": "Artificial Intelligence."}]}'
+
+    # Configure the mock to return a specific response
+    language_model.generate_flashcards.return_value = expected_response
+
+    # Call the method
+    response = language_model.generate_flashcards(prompt)
+
+    # Assertions
+    assert response == expected_response
+    language_model.generate_flashcards.assert_called_once_with(prompt)
tests/test_processing.py
ADDED
@@ -0,0 +1,73 @@
+# tests/test_processing.py
+
+import pytest
+from app.processing import process_text_input, process_file
+
+def test_process_text_input_success(language_model):
+    """
+    Test processing of valid text input.
+    """
+    input_text = "This is a sample text for flashcard extraction."
+    output_format = "JSON"
+    expected_output = '{"flashcards": []}'
+
+    result = process_text_input(input_text, output_format, language_model)
+    assert result == expected_output
+    language_model.generate_flashcards.assert_called_once()
+
+def test_process_text_input_empty(language_model):
+    """
+    Test processing of empty text input.
+    """
+    input_text = "   "
+    output_format = "JSON"
+
+    with pytest.raises(ValueError) as excinfo:
+        process_text_input(input_text, output_format, language_model)
+    assert "No text provided." in str(excinfo.value)
+
+def test_process_file_unsupported_type(language_model, tmp_path):
+    """
+    Test processing of an unsupported file type.
+    """
+    # Create a dummy unsupported file
+    dummy_file = tmp_path / "dummy.unsupported"
+    dummy_file.write_text("Unsupported content")
+
+    with pytest.raises(ValueError) as excinfo:
+        process_file(dummy_file, "JSON", language_model)
+    assert "Unsupported file type." in str(excinfo.value)
+
+def test_process_file_pdf(language_model, tmp_path, mocker):
+    """
+    Test processing of a PDF file.
+    """
+    # Mock the process_pdf function
+    mocker.patch('app.processing.process_pdf', return_value="Extracted PDF text.")
+
+    # Create a dummy PDF file
+    dummy_file = tmp_path / "test.pdf"
+    dummy_file.write_text("PDF content")
+
+    expected_output = '{"flashcards": []}'
+
+    result = process_file(dummy_file, "JSON", language_model)
+    assert result == expected_output
+    language_model.generate_flashcards.assert_called_once()
+
+def test_process_file_txt(language_model, tmp_path, mocker):
+    """
+    Test processing of a TXT file.
+    """
+    # Mock the read_text_file function
+    mocker.patch('app.processing.read_text_file', return_value="Extracted TXT text.")
+
+    # Create a dummy TXT file
+    dummy_file = tmp_path / "test.txt"
+    dummy_file.write_text("TXT content")
+
+    expected_output = '{"flashcards": []}'
+
+    result = process_file(dummy_file, "JSON", language_model)
+    assert result == expected_output
+    language_model.generate_flashcards.assert_called_once()
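format_prompt is the one branch point without coverage; a small extra test along these lines would pin down both formats (a sketch, not part of this commit):

from app.processing import format_prompt

def test_format_prompt_branches():
    # The JSON branch shows a JSON example, the CSV branch a CSV example;
    # matching is case-insensitive via .lower()
    assert '"question"' in format_prompt("json")
    assert "CSV format" in format_prompt("CSV")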