# Hugging Face Spaces page header captured with this file (Space status: Sleeping).
# This project uses the BART model from Facebook AI Research (FAIR) available at https://huggingface.co/facebook/bart-large-cnn under the Apache License 2.0. | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import fitz # PyMuPDF | |
import gradio as gr | |
from transformers import pipeline | |
import re | |
# Load the summarization model once at import time so every request reuses it.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string holding each page's text followed by a newline.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises for an unreadable
        or invalid file; callers are expected to handle it.
    """
    # Open the document as a context manager so it is closed even when a
    # page fails to load; the original never closed it.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the growing string once per page.
        return "".join(page.get_text("text") + "\n" for page in doc)
def find_section(text, section_title):
    """Return the block of text that starts at the first line naming a section.

    The search is case-insensitive and treats ``section_title`` as literal
    text. The returned block starts at the beginning of the first line that
    contains the title and runs up to the next blank line (``"\\n\\n"``), or
    to the end of ``text`` when there is no blank line after it.

    Args:
        text: The full document text to search.
        section_title: The title to look for, e.g. ``"abstract"``.

    Returns:
        The matching block with surrounding whitespace stripped, or ``None``
        when no line contains the title.
    """
    # Escape the title so characters such as "(", "+" or "." are matched
    # literally instead of being interpreted as regex syntax.
    pattern = re.compile(
        r"^.*{}.*$".format(re.escape(section_title)),
        re.IGNORECASE | re.MULTILINE,
    )
    match = pattern.search(text)
    if match is None:
        return None

    # The pattern is anchored with ^, so the match starts at the line start.
    start_idx = match.start()
    end_idx = text.find("\n\n", start_idx)
    if end_idx == -1:
        end_idx = len(text)
    return text[start_idx:end_idx].strip()
def summarize_section(text, section_title, max_length=150):
    """Summarize one named section of a document.

    Args:
        text: The full document text.
        section_title: Title of the section to locate with ``find_section``.
        max_length: Upper bound passed to the summarizer for the summary
            length. The lower bound is fixed at 30.

    Returns:
        The summary text on success, a "not found" message when the section
        cannot be located, or an error message when locating or summarizing
        raises. This function never raises; failures are reported as strings
        so the UI always has something to display.
    """
    try:
        section_text = find_section(text, section_title)
        if not section_text:
            return f"Section '{section_title}' not found."

        # truncation=True clips sections longer than the model's input limit
        # instead of letting the pipeline fail on them.
        summary = summarizer(
            section_text,
            max_length=max_length,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']
    except Exception as e:
        # Deliberate catch-all boundary: report the failure to the caller.
        return f"Error processing section '{section_title}': {str(e)}"
def process_pdf(file):
    """Produce the three section summaries for an uploaded PDF.

    Args:
        file: The uploaded file object; its ``name`` attribute is used as the
            path of the PDF to read.

    Returns:
        A list of three strings, in order: abstract, research question and
        results. Each is whatever ``summarize_section`` returns for that
        title. If text extraction fails, all three entries are the same
        extraction error message.
    """
    try:
        pdf_text = extract_text_from_pdf(file.name)
    except Exception as exc:
        failure = f"Error extracting text from PDF: {str(exc)}"
        return [failure, failure, failure]

    # Order matters: it must line up with the three output text boxes.
    section_titles = ("abstract", "research question", "results")
    return [summarize_section(pdf_text, title) for title in section_titles]
# Configure the Gradio interface: one PDF upload in, three summary boxes out.
interface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.Textbox(label=caption)
        for caption in (
            "Abstract Summary",
            "Research Question Summary",
            "Results Summary",
        )
    ],
)

# Launch the interface.
interface.launch()