import streamlit as st
import pytesseract
from pdf2image import convert_from_path
import os
import re
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
import base64
import torch


# Install the system dependencies needed at runtime: Tesseract OCR with the
# German language pack, and Poppler so pdf2image can rasterize PDF pages.
os.system("apt-get update && apt-get install -y tesseract-ocr tesseract-ocr-deu poppler-utils")

pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Hugging Face access token, stored base64-encoded in the source and decoded
# at runtime for loading the model from the Hub.
encoded_token = "aGZfQ05kQ2NQclNIa0NYZ2FIU01JTG1zUHpPdk5qTWVUdGpJUg=="
token = base64.b64decode(encoded_token).decode("utf-8")

# Load the tokenizer and model used for summarization.
# (use_auth_token is the older argument name; newer transformers versions use token=.)
tokenizer = AutoTokenizer.from_pretrained("BabakBagheriGisour/pdf-suzammenfassen", use_auth_token=token)
model = AutoModelForCausalLM.from_pretrained("BabakBagheriGisour/pdf-suzammenfassen", use_auth_token=token)

# Run inference on the GPU if one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def clean_text(text):
    # Collapse all whitespace runs into single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Remove URLs.
    text = re.sub(r'http\S+|www\.[\w.-]+', '', text)
    # Keep only word characters, whitespace, basic punctuation and German umlauts/ß.
    text = re.sub(r'[^\w\s.,ÄäÖöÜüß]', '', text)
    # Strip control characters.
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
    text = text.strip()
    # Remove stray spaces before punctuation and collapse the whitespace after
    # it to a single space.
    text = re.sub(r'\s([.,])', r'\1', text)
    text = re.sub(r'([.,])\s+', r'\1 ', text)
    return text
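
# Illustrative example (values are hypothetical, not taken from the app):
# clean_text("Hallo ,   Welt .\nSiehe  www.example.com")
# returns "Hallo, Welt. Siehe" - the URL is dropped, whitespace is collapsed,
# and stray spaces before punctuation are removed.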


def extract_text_using_ocr(pdf_path):
    # Render every PDF page at 300 DPI and run German OCR on each image.
    poppler_path = "/usr/bin"
    pages = convert_from_path(pdf_path, 300, poppler_path=poppler_path)

    all_text = []
    for page in pages:
        text = pytesseract.image_to_string(page, lang="deu")
        all_text.append(text)

    # Return one cleaned text string per page.
    return [clean_text(text) for text in all_text]
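
# Illustrative usage (hypothetical file name): each list entry is the cleaned
# OCR text of one PDF page.
# pages_text = extract_text_using_ocr("bericht.pdf")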


def split_text_into_chunks(text, max_length=512):
    # Greedily pack words into chunks whose approximate token count stays
    # below max_length; each word is encoded separately, so the count is an
    # estimate rather than the exact token length of the joined chunk.
    words = text.split()
    chunks = []
    chunk = []
    length = 0

    for word in words:
        length += len(tokenizer.encode(word))
        if length > max_length:
            chunks.append(" ".join(chunk))
            chunk = [word]
            length = len(tokenizer.encode(word))
        else:
            chunk.append(word)

    if chunk:
        chunks.append(" ".join(chunk))

    return chunks
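
# Illustrative example (hypothetical numbers): with max_length=5 a sentence
# such as "eins zwei drei vier fünf sechs" is split into several short chunks,
# e.g. ["eins zwei", "drei vier", "fünf sechs"], with the exact boundaries
# depending on how many tokens the tokenizer assigns to each word.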


def extract_first_sentence(text):
    # Keep only the first sentence of the generated text.
    return text.split(".")[0] + "."
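
# Illustrative example: extract_first_sentence("Erster Satz. Zweiter Satz.")
# returns "Erster Satz.".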


def summarize_text(text):
    chunks = split_text_into_chunks(text)

    summaries = []
    for chunk in chunks:
        # Tokenize the chunk, truncating at the same limit used when chunking.
        inputs = tokenizer(chunk, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)

        # Generate a short continuation with beam search; the length limits
        # apply to the newly generated tokens, not to the prompt.
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            num_beams=2,
            max_new_tokens=30,
            min_new_tokens=10,
            early_stopping=True
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Keep only the first sentence of each chunk's output.
        first_sentence = extract_first_sentence(summary)
        summaries.append(first_sentence)

    return " ".join(summaries)


st.markdown("""
    <style>
    .title {
        font-size: 30px;
        color: #4CAF50;
        text-align: center;
        font-weight: bold;
    }
    .info-text {
        font-size: 18px;
        color: #555555;
    }
    .summary-box {
        background-color: #f4f4f9;
        border-radius: 10px;
        padding: 10px;
        margin-bottom: 15px;
    }
    .download-btn {
        background-color: #cccccc;
        color: white;
        padding: 10px 20px;
        border-radius: 5px;
        text-align: center;
        font-size: 16px;
        display: inline-block;
        cursor: pointer;
        margin-top: 10px;
    }
    .download-btn:hover {
        background-color: #999999;
    }
    .download-text-btn {
        background-color: #cccccc; /* color */
        color: white;
        padding: 10px 20px;
        border-radius: 5px;
        text-align: center;
        font-size: 16px;
        display: inline-block;
        cursor: pointer;
        margin-top: 10px;
    }
    .download-text-btn:hover {
        background-color: #999999;
    }
    </style>
""", unsafe_allow_html=True)


st.markdown('<p class="title">PDF Datei Zusammenfassen</p>', unsafe_allow_html=True)
uploaded_file = st.file_uploader("Bitte laden Sie eine PDF-Datei hoch", type="pdf")


if uploaded_file:
    # Save the upload to a temporary file so pdf2image can read it from disk.
    temp_file_path = f"temp_{uploaded_file.name}"
    with open(temp_file_path, "wb") as f:
        f.write(uploaded_file.read())

    # Run OCR on every page and report progress to the user.
    st.info("Das Modell verarbeitet den Text...")
    progress_bar = st.progress(0)
    pages_text = extract_text_using_ocr(temp_file_path)
    total_pages = len(pages_text)

    st.subheader("Zusammenfassungen:")
    summaries = []
    for idx, page_text in enumerate(pages_text):
        # Summarize each page and display the original text next to its summary.
        summary = summarize_text(page_text)
        summaries.append({
            "text": page_text,
            "zusammenfassen": summary
        })
        st.write(f"**Seite {idx + 1}:**")
        st.text_area(f"Seite {idx + 1} - Text", page_text, height=200, key=f"text_{idx}")
        st.text_area("Zusammenfassung", summary, height=100, key=f"summary_{idx}")
        progress_bar.progress((idx + 1) / total_pages)

    # Write one JSON object per page to a JSONL file.
    output_file = f"{uploaded_file.name.split('.')[0]}.jsonl"
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in summaries:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
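
    # Each JSONL line pairs a page's cleaned text with its summary,
    # e.g. (illustrative values):
    # {"text": "Gereinigter Seitentext ...", "zusammenfassen": "Erster Satz der Zusammenfassung."}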

    st.success("Verarbeitung abgeschlossen!")

    # Build data-URI download links for the JSONL file and for a plain-text
    # file containing all page summaries.
    output_jsonl = f'<a href="data:file/json;base64,{base64.b64encode(open(output_file, "rb").read()).decode()}" class="download-btn" download="{output_file}">Download JSONL</a>'

    full_summary = "\n\n".join([item["zusammenfassen"] for item in summaries])
    output_summary = f'<a href="data:text/plain;base64,{base64.b64encode(full_summary.encode()).decode()}" class="download-text-btn" download="{uploaded_file.name.split(".")[0]}_full_summary.txt">Download Full Summary</a>'

    st.markdown(
        f"""
        <div style="display: flex; gap: 10px; justify-content: center;">
            {output_jsonl}
            {output_summary}
        </div>
        """,
        unsafe_allow_html=True
    )

    # Remove the temporary files once the download links have been rendered.
    os.remove(temp_file_path)
    os.remove(output_file)