Almaatla committed on
Commit
6cfb094
1 Parent(s): 108ada8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -6
app.py CHANGED
@@ -10,27 +10,71 @@ def extract_text_from_pdf(file_path):
10
  text += pdf.getPage(page_num).extractText()
11
  return text
12
 
13
- def count_tokens(text):
14
- tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
15
  tokens = tokenizer.encode(
16
  text,
17
  disallowed_special=()
18
  )
19
- return len(tokens)
 
 
 
20
 
21
  def count_tokens_in_file(file):
22
  # Extract text from the PDF file
23
  paper_text = extract_text_from_pdf(file.name)
24
  return count_tokens(paper_text)
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  with gr.Blocks() as demo:
27
  gr.Markdown("Upload your document to count their tokens")
28
- with gr.Tab("Upload PDF & TXT"):
29
  docs_input = gr.File(file_count="single", file_types=[".pdf"])
30
  tb_tokenCount = gr.Textbox(label='Number of tokens')
31
  docs_input.upload(count_tokens_in_file,inputs=[docs_input],outputs=[tb_tokenCount])
32
- btn_count = gr.Button("Count token")
33
- btn_count.click(count_tokens_in_file,inputs=[docs_input],outputs=[tb_tokenCount])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  #demo.queue()
36
  demo.launch(debug=True,share=False)
 
10
  text += pdf.getPage(page_num).extractText()
11
  return text
12
 
13
def tokenize(text, model="gpt-3.5-turbo"):
    """Encode *text* into token ids using the tokenizer associated with *model*.

    Args:
        text: The string to encode.
        model: OpenAI model name whose encoding should be used.

    Returns:
        The list of token ids produced by tiktoken.
    """
    encoder = tiktoken.encoding_for_model(model)
    # disallowed_special=() lets arbitrary document text through without
    # raising on special-token markers.
    return encoder.encode(text, disallowed_special=())
20
+
21
def count_tokens(text):
    """Return the number of tokens *text* encodes to (gpt-3.5-turbo encoding)."""
    token_ids = tokenize(text)
    return len(token_ids)
23
 
24
def count_tokens_in_file(file):
    """Count tokens in an uploaded PDF.

    Args:
        file: Uploaded file object exposing a ``.name`` path (Gradio File).

    Returns:
        Token count of the PDF's extracted text.
    """
    extracted = extract_text_from_pdf(file.name)
    return count_tokens(extracted)
28
 
29
def chunk_text(text, max_char, overlap):
    """Split *text* into character chunks with a fixed overlap.

    Args:
        text: The string to split.
        max_char: Maximum characters per chunk. Coerced to int (Gradio
            sliders deliver floats, and slice indices must be ints).
        overlap: Characters shared between consecutive chunks. Coerced to int.

    Returns:
        List of chunk strings; an empty list for empty text.

    Raises:
        ValueError: If ``overlap >= max_char`` — the window would never
            advance and the original loop would spin forever.
    """
    max_char = int(max_char)
    overlap = int(overlap)
    if max_char <= overlap:
        raise ValueError("overlap must be smaller than max_char")

    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_char, len(text))
        chunks.append(text[start:end])
        # Advance by the non-overlapping portion of the window.
        start += max_char - overlap
    return chunks
43
+
44
def chunk_file(file, max_char, overlap):
    """Chunk an uploaded PDF's text and join the chunks with a visible separator.

    Args:
        file: Uploaded file object exposing a ``.name`` path (Gradio File).
        max_char: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks.

    Returns:
        A single display string: chunks separated by a bracketed marker.
    """
    pdf_text = extract_text_from_pdf(file.name)
    separator = '\n\n[xxxxxxxxxxxxxxxxx]\n\n'
    return separator.join(chunk_text(pdf_text, max_char, overlap))
50
+
51
# Gradio UI: one tab for PDF uploads, one for pasted text.
with gr.Blocks() as demo:
    gr.Markdown("Upload your document to count their tokens")

    with gr.Tab("Upload PDF"):
        docs_input = gr.File(file_count="single", file_types=[".pdf"])
        tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Token count refreshes automatically on upload.
        docs_input.upload(count_tokens_in_file, inputs=[docs_input], outputs=[tb_tokenCount])

        sl_max_char_per_chunk = gr.Slider(1000, 30000, value=2000, label="Number of characters", info="Choose a number of characters per chunk")
        sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
        btn_chunk = gr.Button("Chunk text")
        tb_chunked_text = gr.Textbox(label='Result')
        btn_chunk.click(chunk_file, inputs=[docs_input, sl_max_char_per_chunk, sl_overlap], outputs=[tb_chunked_text])

    with gr.Tab("Text"):
        text_input = gr.Textbox(label='Insert your text here')
        text_tb_tokenCount = gr.Textbox(label='Number of tokens')
        # Recount on every edit of the text box.
        text_input.change(count_tokens, inputs=[text_input], outputs=[text_tb_tokenCount])

        text_sl_max_char_per_chunk = gr.Slider(1000, 30000, value=2000, label="Number of characters", info="Choose a number of characters per chunk")
        text_sl_overlap = gr.Slider(0, 20000, value=400, label="Overlap", info="Choose overlap size")
        text_btn_chunk = gr.Button("Chunk text")
        text_tb_chunked_text = gr.Textbox(label='Result')

        def format_chunks(text, max_char, overlap):
            """Join chunks of *text* with a bracketed marker for display."""
            # NOTE(review): separator here has 16 'x' chars vs 17 in
            # chunk_file — presumably unintentional, but kept as-is.
            return '\n\n[xxxxxxxxxxxxxxxx]\n\n'.join(chunk_text(text, max_char, overlap))

        text_btn_chunk.click(format_chunks,
                             inputs=[text_input, text_sl_max_char_per_chunk, text_sl_overlap],
                             outputs=[text_tb_chunked_text])

#demo.queue()
demo.launch(debug=True, share=False)