plaggy committed on
Commit
7ddbf9d
1 Parent(s): a5c096a

Added requirements.txt; changed the spaCy text splitter to the recursive character splitter

Browse files
Files changed (2) hide show
  1. app.py +6 -6
  2. requirements.txt +4 -0
app.py CHANGED
@@ -9,7 +9,7 @@ import requests
9
  import logging
10
 
11
  from aiohttp import ClientSession
12
- from langchain.text_splitter import SpacyTextSplitter
13
  from datasets import Dataset, load_dataset
14
  from tqdm import tqdm
15
  from tqdm.asyncio import tqdm_asyncio
@@ -26,8 +26,8 @@ class Chunker:
26
  def __init__(self, strategy, split_seq=".", chunk_len=512):
27
  self.split_seq = split_seq
28
  self.chunk_len = chunk_len
29
- if strategy == "spacy":
30
- self.split = SpacyTextSplitter().split_text
31
  if strategy == "sequence":
32
  self.split = self.seq_splitter
33
  if strategy == "constant":
@@ -138,7 +138,7 @@ def run_embed(input_ds, input_splits, embed_in_text_col, output_ds, tei_url, pri
138
 
139
 
140
  def change_dropdown(choice):
141
- if choice == "spacy" or choice == "sequence":
142
  return [
143
  gr.Textbox(visible=True),
144
  gr.Textbox(visible=False)
@@ -166,8 +166,8 @@ with gr.Blocks() as demo:
166
  chunk_private = gr.Checkbox(label="Make chunked dataset private")
167
  with gr.Row():
168
  dropdown = gr.Dropdown(
169
- ["spacy", "sequence", "constant"], label="Chunking strategy",
170
- info="'spacy' uses a Spacy tokenizer, 'sequence' splits texts by a chosen sequence, "
171
  "'constant' makes chunks of the constant size",
172
  scale=2
173
  )
 
9
  import logging
10
 
11
  from aiohttp import ClientSession
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from datasets import Dataset, load_dataset
14
  from tqdm import tqdm
15
  from tqdm.asyncio import tqdm_asyncio
 
26
  def __init__(self, strategy, split_seq=".", chunk_len=512):
27
  self.split_seq = split_seq
28
  self.chunk_len = chunk_len
29
+ if strategy == "recursive":
30
+ self.split = RecursiveCharacterTextSplitter().split_text
31
  if strategy == "sequence":
32
  self.split = self.seq_splitter
33
  if strategy == "constant":
 
138
 
139
 
140
  def change_dropdown(choice):
141
+ if choice == "recursive" or choice == "sequence":
142
  return [
143
  gr.Textbox(visible=True),
144
  gr.Textbox(visible=False)
 
166
  chunk_private = gr.Checkbox(label="Make chunked dataset private")
167
  with gr.Row():
168
  dropdown = gr.Dropdown(
169
+ ["recursive", "sequence", "constant"], label="Chunking strategy",
170
+ info="'recursive' uses a Langchain recursive tokenizer, 'sequence' splits texts by a chosen sequence, "
171
  "'constant' makes chunks of the constant size",
172
  scale=2
173
  )
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ langchain==0.0.*
2
+ aiohttp==3.8.*
3
+ datasets==2.16.*
4
+ numpy==1.25.*