Almaatla committed on
Commit
14a6f5b
1 Parent(s): 6cfb094

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -40
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import gradio as gr
2
  from PyPDF4 import PdfFileReader
3
  import tiktoken
@@ -10,71 +12,88 @@ def extract_text_from_pdf(file_path):
10
  text += pdf.getPage(page_num).extractText()
11
  return text
12
 
13
- def tokenize(text,model="gpt-3.5-turbo"):
14
  tokenizer = tiktoken.encoding_for_model(model)
15
- tokens = tokenizer.encode(
16
- text,
17
- disallowed_special=()
18
- )
19
  return tokens
20
 
21
  def count_tokens(text):
22
  return len(tokenize(text))
23
 
24
- def count_tokens_in_file(file):
25
- # Extract text from the PDF file
 
 
 
 
 
 
 
 
 
 
26
  paper_text = extract_text_from_pdf(file.name)
27
- return count_tokens(paper_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def chunk_text(text, max_char, overlap):
30
  chunks = []
31
  start = 0
32
  end = max_char
33
- print(f"max char: {max_char}")
34
  while start < len(text):
35
  if end >= len(text):
36
  end = len(text)
37
  chunk = text[start:end]
38
- print(f"chunk[{start}:{end}] size: {count_tokens(chunk)} tokens")
39
- chunks.append(chunk)
40
  start += max_char - overlap
41
  end = start + max_char
42
  return chunks
43
 
44
- def chunk_file(file, max_char,overlap):
45
- # Extract text from the PDF file
46
  text = extract_text_from_pdf(file.name)
47
  chunks = chunk_text(text, max_char, overlap)
 
 
 
48
 
49
- return '\n\n[xxxxxxxxxxxxxxxxx]\n\n'.join(chunks)
 
 
 
 
50
 
51
  with gr.Blocks() as demo:
52
- gr.Markdown("Upload your document to count their tokens")
53
- with gr.Tab("Upload PDF"):
54
- docs_input = gr.File(file_count="single", file_types=[".pdf"])
55
- tb_tokenCount = gr.Textbox(label='Number of tokens')
56
- docs_input.upload(count_tokens_in_file,inputs=[docs_input],outputs=[tb_tokenCount])
57
- sl_max_char_per_chunk = gr.Slider(1000, 30000, value=2000, label="Number of characters", info="Choose a number of characters per chunk")
58
- sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
 
59
 
60
- btn_chunk = gr.Button("Chunk text")
61
- tb_chunked_text = gr.Textbox(label='Result')
62
-
63
- btn_chunk.click(chunk_file,inputs=[docs_input,sl_max_char_per_chunk,sl_overlap],outputs=[tb_chunked_text])
64
- with gr.Tab("Text"):
65
- text_input = gr.Textbox(label='Insert your text here')
66
- text_tb_tokenCount = gr.Textbox(label='Number of tokens')
67
- text_input.change(count_tokens,inputs=[text_input],outputs=[text_tb_tokenCount])
68
- text_sl_max_char_per_chunk = gr.Slider(1000, 30000, value=2000, label="Number of characters", info="Choose a number of characters per chunk")
69
- text_sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
70
 
71
- text_btn_chunk = gr.Button("Chunk text")
72
- text_tb_chunked_text = gr.Textbox(label='Result')
73
- def format_chunks(text,max_char,overlap):
74
- return '\n\n[xxxxxxxxxxxxxxxx]\n\n'.join(chunk_text(text,max_char,overlap))
75
- text_btn_chunk.click(format_chunks,
76
- inputs=[text_input,text_sl_max_char_per_chunk,text_sl_overlap],
77
- outputs=[text_tb_chunked_text])
78
 
79
- #demo.queue()
80
- demo.launch(debug=True,share=False)
 
1
+ import os
2
+ import zipfile
3
  import gradio as gr
4
  from PyPDF4 import PdfFileReader
5
  import tiktoken
 
12
  text += pdf.getPage(page_num).extractText()
13
  return text
14
 
15
def tokenize(text, model="gpt-3.5-turbo"):
    """Encode ``text`` into token ids with the tiktoken encoding for ``model``.

    ``disallowed_special=()`` switches off tiktoken's special-token check, so
    text that looks like a special token is encoded as ordinary text instead
    of raising.
    """
    encoding = tiktoken.encoding_for_model(model)
    return encoding.encode(text, disallowed_special=())
19
 
20
def count_tokens(text):
    """Return how many tokens ``tokenize`` produces for ``text``."""
    token_ids = tokenize(text)
    return len(token_ids)
22
 
23
def analyse_text(text):
    """Summarise ``text`` as character count, token count and chars per token.

    Returns:
        A three-line string with the measurements, or the string ``'no text'``
        when there is nothing to measure (empty/missing text or zero tokens).
    """
    # Nothing to analyse: return the placeholder directly instead of relying
    # on a ZeroDivisionError further down.
    if not text:
        return 'no text'

    num_tokens = count_tokens(text)
    if num_tokens == 0:
        # Guard the ratio below against division by zero.
        return 'no text'

    return '\n'.join([
        f"Text length: {len(text)}",
        f"Token counts: {num_tokens}",
        f"Char per token: {len(text) / num_tokens:.1f}",
    ])
33
+
34
def analyse_file(file):
    """Return the text extracted from the uploaded PDF at ``file.name``."""
    return extract_text_from_pdf(file.name)
37
+
38
def write_chunks_to_files(chunks):
    """Write each chunk to ``chunk_<n>.txt`` in the current working directory.

    Args:
        chunks: Iterable of strings; numbering of the output files starts at 1.

    Returns:
        The list of file paths written, in chunk order.
    """
    file_paths = []
    for i, chunk in enumerate(chunks, start=1):
        file_path = f"chunk_{i}.txt"
        # Pin UTF-8: the platform default encoding may be unable to represent
        # characters found in the extracted text.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        file_paths.append(file_path)
    return file_paths
46
+
47
def write_chunks_to_zip(chunks):
    """Bundle ``chunks`` into ``chunks.zip`` in the current working directory.

    Each chunk becomes an archive member named ``chunk_<n>.txt`` (numbered
    from 1). Members are written straight into the archive, so no temporary
    text files are created on disk and nothing is left behind if writing
    fails part-way through.

    Args:
        chunks: Iterable of strings to store.

    Returns:
        The archive file name, ``"chunks.zip"``.
    """
    zip_file_name = "chunks.zip"
    # Chunks are plain text, so deflate compression shrinks the download.
    with zipfile.ZipFile(zip_file_name, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for i, chunk in enumerate(chunks, start=1):
            # writestr encodes str data as UTF-8.
            zipf.writestr(f"chunk_{i}.txt", chunk)
    return zip_file_name
55
 
56
def chunk_text(text, max_char, overlap):
    """Split ``text`` into windows of ``max_char`` characters.

    Consecutive windows start ``max_char - overlap`` characters apart, so each
    window repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        max_char: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must be smaller than ``max_char``.

    Returns:
        A list of ``(chunk, char_count, token_count)`` tuples in text order.

    Raises:
        ValueError: If ``max_char`` is not positive or ``overlap`` is not
            smaller than ``max_char``.
    """
    # UI sliders may deliver numeric values that are not plain ints.
    max_char = int(max_char)
    overlap = int(overlap)
    step = max_char - overlap
    # A zero or negative step would never advance through the text.
    if max_char <= 0 or step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_char ({max_char}), "
            "and max_char must be positive"
        )

    chunks = []
    for start in range(0, len(text), step):
        # Slicing clamps at the end of the text, so the last chunk may be short.
        chunk = text[start:start + max_char]
        chunks.append((chunk, len(chunk), count_tokens(chunk)))
    return chunks
69
 
70
def chunk_file(file, max_char, overlap):
    """Extract the text of the PDF at ``file.name``, then chunk and zip it.

    Delegates to ``chunk_and_zip_text`` so the file and plain-text paths share
    one implementation of the summary formatting and archive creation.

    Returns:
        A ``(summary, zip_path)`` tuple, exactly as ``chunk_and_zip_text``
        returns it.
    """
    text = extract_text_from_pdf(file.name)
    return chunk_and_zip_text(text, max_char, overlap)
76
 
77
def chunk_and_zip_text(text, max_char, overlap):
    """Chunk ``text``, zip the chunks, and describe each chunk.

    Returns:
        A ``(summary, zip_path)`` tuple: ``summary`` has one line per chunk
        giving its size in characters and tokens, and ``zip_path`` is the
        value returned by ``write_chunks_to_zip``.
    """
    pieces = chunk_text(text, max_char, overlap)

    summary_lines = []
    bodies = []
    for index, (body, _char_count, token_count) in enumerate(pieces, start=1):
        summary_lines.append(
            f"Chunk[{index}]: Size: {len(body)} chars, {token_count} tokens"
        )
        bodies.append(body)

    archive_path = write_chunks_to_zip(bodies)
    return '\n'.join(summary_lines), archive_path
82
 
83
with gr.Blocks() as demo:
    # Components are declared in the order they appear on the page.
    docs_input = gr.File(file_count="single", file_types=[".pdf"])
    text_to_chunk = gr.Textbox(label='Text to chunk', show_copy_button=True)
    tb_analysis = gr.Textbox(label='Text Analysis')
    sl_max_char_per_chunk = gr.Slider(
        1000,
        300000,
        value=10000,
        label="Number of characters",
        info="Choose a number of characters per chunk",
    )
    sl_overlap = gr.Slider(
        0,
        20000,
        value=400,
        label="Overlap",
        info="Choose overlap size",
    )
    btn_chunk = gr.Button("Chunk text")
    tb_chunked_text = gr.Textbox(label='Chunks Info')
    download_link = gr.File(label='Download Chunks')

    # Uploading a PDF puts its extracted text into text_to_chunk; any change
    # to that textbox (upload or manual edit) then refreshes tb_analysis.
    docs_input.upload(
        analyse_file,
        inputs=[docs_input],
        outputs=[text_to_chunk],
    )
    text_to_chunk.change(
        analyse_text,
        inputs=[text_to_chunk],
        outputs=[tb_analysis],
    )

    # Chunk the current text and offer the zipped chunks for download.
    btn_chunk.click(
        chunk_and_zip_text,
        inputs=[text_to_chunk, sl_max_char_per_chunk, sl_overlap],
        outputs=[tb_chunked_text, download_link],
    )

demo.launch(debug=True, share=False)