File size: 3,744 Bytes
14a6f5b
 
c96166d
9f38a4d
 
c96166d
 
 
 
 
 
 
 
 
14a6f5b
6cfb094
14a6f5b
6cfb094
 
 
 
c96166d
14a6f5b
 
 
 
 
 
 
 
 
 
 
 
c96166d
14a6f5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c96166d
6cfb094
 
 
 
 
 
 
 
14a6f5b
 
6cfb094
 
 
 
14a6f5b
6cfb094
 
14a6f5b
 
 
6cfb094
14a6f5b
 
 
 
 
6cfb094
c96166d
14a6f5b
 
 
 
 
 
 
 
6cfb094
14a6f5b
 
 
6cfb094
14a6f5b
fe257d3
14a6f5b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import zipfile
import gradio as gr
from PyPDF4 import PdfFileReader
import tiktoken

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*."""
    page_texts = []
    with open(file_path, "rb") as fh:
        reader = PdfFileReader(fh)
        for page_index in range(reader.getNumPages()):
            page_texts.append(reader.getPage(page_index).extractText())
    return "".join(page_texts)

def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* with the tiktoken encoding registered for *model*.

    Returns the list of integer token ids; special-token text is allowed
    through as plain text (disallowed_special=()).
    """
    encoding = tiktoken.encoding_for_model(model)
    return encoding.encode(text, disallowed_special=())

def count_tokens(text):
    """Return how many tokens *text* encodes to under the default model."""
    tokens = tokenize(text)
    return len(tokens)

def analyse_text(text):
    """Summarize *text* for display: length, token count, chars-per-token.

    Returns the three stats joined with newlines, or 'no text' when the
    input tokenizes to nothing (e.g. an empty textbox).
    """
    num_tokens = count_tokens(text)
    # Guard the division explicitly. The old code caught the
    # ZeroDivisionError with a bare `except:`, rebound `result` to the
    # string 'no text', and then returned '\n'.join('no text') — which
    # mangled the message into one character per line.
    if num_tokens == 0:
        return 'no text'
    stats = [
        f"Text length: {len(text)}",
        f"Token counts: {num_tokens}",
        f"Char per token: {len(text) / num_tokens:.1f}",
    ]
    return '\n'.join(stats)

def analyse_file(file):
    """Extract and return the text of an uploaded PDF (a Gradio file object)."""
    return extract_text_from_pdf(file.name)

def write_chunks_to_files(chunks):
    """Write each chunk string to its own numbered text file in the CWD.

    Args:
        chunks: iterable of strings.

    Returns:
        The list of file paths written, "chunk_1.txt", "chunk_2.txt", ...
    """
    file_paths = []
    for i, chunk in enumerate(chunks, start=1):
        file_path = f"chunk_{i}.txt"
        # Fix: pin the encoding — with no encoding= the platform locale
        # default is used, which can raise UnicodeEncodeError (or silently
        # mis-encode) on non-ASCII text extracted from PDFs.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        file_paths.append(file_path)
    return file_paths

def write_chunks_to_zip(chunks):
    """Bundle every chunk into chunks.zip and return the archive's name.

    The intermediate chunk_*.txt files are deleted once they have been
    added to the archive.
    """
    zip_file_name = "chunks.zip"
    with zipfile.ZipFile(zip_file_name, 'w') as archive:
        for path in write_chunks_to_files(chunks):
            archive.write(path)
            os.remove(path)  # the text lives on only inside the zip
    return zip_file_name

def chunk_text(text, max_char, overlap):
    """Split *text* into overlapping windows of at most *max_char* characters.

    Args:
        text: the string to split.
        max_char: maximum characters per chunk.
        overlap: characters shared between consecutive chunks; must be
            smaller than max_char.

    Returns:
        A list of (chunk, char_count, token_count) tuples.

    Raises:
        ValueError: if overlap >= max_char.
    """
    step = max_char - overlap
    if step <= 0:
        # Fix: the original looped forever when overlap >= max_char —
        # `start` never advanced (or moved backwards). The UI sliders
        # allow exactly that combination, so fail fast instead.
        raise ValueError("overlap must be smaller than max_char")
    chunks = []
    start = 0
    while start < len(text):
        # Slicing already clamps past-the-end indices, so no explicit
        # `end = len(text)` adjustment is needed.
        chunk = text[start:start + max_char]
        chunks.append((chunk, len(chunk), count_tokens(chunk)))
        start += step
    return chunks

def chunk_file(file, max_char, overlap):
    """Chunk an uploaded PDF's text and package the chunks into a zip.

    Returns a (summary, zip_path) pair: one line per chunk describing its
    size in chars and tokens, plus the path of the zip holding the chunks.
    """
    chunks = chunk_text(extract_text_from_pdf(file.name), max_char, overlap)
    summary_lines = []
    for idx, (body, _chars, tokens) in enumerate(chunks, start=1):
        summary_lines.append(f"Chunk[{idx}]: Size: {len(body)} chars, {tokens} tokens")
    zip_file_path = write_chunks_to_zip([body for body, _chars, _tokens in chunks])
    return '\n'.join(summary_lines), zip_file_path

def chunk_and_zip_text(text, max_char, overlap):
    """Chunk raw *text* and package the chunks into a zip archive.

    Returns a (summary, zip_path) pair: one line per chunk describing its
    size in chars and tokens, plus the path of the zip holding the chunks.
    """
    pieces = chunk_text(text, max_char, overlap)
    summary = '\n'.join(
        f"Chunk[{idx}]: Size: {len(body)} chars, {tokens} tokens"
        for idx, (body, _chars, tokens) in enumerate(pieces, start=1)
    )
    archive_path = write_chunks_to_zip([body for body, _chars, _tokens in pieces])
    return summary, archive_path

# Gradio UI: upload a PDF (or paste text), inspect token stats, then split
# the text into overlapping chunks downloadable as a zip archive.
with gr.Blocks() as demo:
    # Single-PDF upload; its extracted text is mirrored into the textbox below.
    docs_input = gr.File(file_count="single", file_types=[".pdf"])
    text_to_chunk = gr.Textbox(label='Text to chunk',show_copy_button=True)
    # Read-only stats (length / tokens / chars-per-token) for the current text.
    tb_analysis = gr.Textbox(label='Text Analysis')
    # NOTE(review): the overlap slider's max (20000) exceeds the chunk-size
    # slider's min (1000); chunk_text never advances in that regime — confirm
    # the intended slider bounds.
    sl_max_char_per_chunk = gr.Slider(1000, 300000, value=10000, label="Number of characters", info="Choose a number of characters per chunk")
    sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
    btn_chunk = gr.Button("Chunk text")
    # Per-chunk summary lines and the zip produced by chunk_and_zip_text.
    tb_chunked_text = gr.Textbox(label='Chunks Info')
    download_link = gr.File(label='Download Chunks')

    # Call analyse_file when a file is uploaded and display the results in tb_analysis
    docs_input.upload(analyse_file,inputs=[docs_input], outputs=[text_to_chunk])
    # Any change to the text (upload or manual edit) refreshes the stats box.
    text_to_chunk.change(analyse_text,inputs=[text_to_chunk],outputs=[tb_analysis])

    btn_chunk.click(chunk_and_zip_text, inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap], outputs=[tb_chunked_text, download_link])

# share=False keeps the app local; debug=True blocks and streams errors.
demo.launch(debug=True, share=False)