Spaces:
Sleeping
Sleeping
Add requirements.txt; change chunking strategy from spaCy to recursive
Browse files
- app.py +6 -6
- requirements.txt +4 -0
app.py
CHANGED
@@ -9,7 +9,7 @@ import requests
|
|
9 |
import logging
|
10 |
|
11 |
from aiohttp import ClientSession
|
12 |
-
from langchain.text_splitter import
|
13 |
from datasets import Dataset, load_dataset
|
14 |
from tqdm import tqdm
|
15 |
from tqdm.asyncio import tqdm_asyncio
|
@@ -26,8 +26,8 @@ class Chunker:
|
|
26 |
def __init__(self, strategy, split_seq=".", chunk_len=512):
|
27 |
self.split_seq = split_seq
|
28 |
self.chunk_len = chunk_len
|
29 |
-
if strategy == "
|
30 |
-
self.split =
|
31 |
if strategy == "sequence":
|
32 |
self.split = self.seq_splitter
|
33 |
if strategy == "constant":
|
@@ -138,7 +138,7 @@ def run_embed(input_ds, input_splits, embed_in_text_col, output_ds, tei_url, pri
|
|
138 |
|
139 |
|
140 |
def change_dropdown(choice):
|
141 |
-
if choice == "
|
142 |
return [
|
143 |
gr.Textbox(visible=True),
|
144 |
gr.Textbox(visible=False)
|
@@ -166,8 +166,8 @@ with gr.Blocks() as demo:
|
|
166 |
chunk_private = gr.Checkbox(label="Make chunked dataset private")
|
167 |
with gr.Row():
|
168 |
dropdown = gr.Dropdown(
|
169 |
-
["
|
170 |
-
info="'
|
171 |
"'constant' makes chunks of the constant size",
|
172 |
scale=2
|
173 |
)
|
|
|
9 |
import logging
|
10 |
|
11 |
from aiohttp import ClientSession
|
12 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
from datasets import Dataset, load_dataset
|
14 |
from tqdm import tqdm
|
15 |
from tqdm.asyncio import tqdm_asyncio
|
|
|
26 |
def __init__(self, strategy, split_seq=".", chunk_len=512):
|
27 |
self.split_seq = split_seq
|
28 |
self.chunk_len = chunk_len
|
29 |
+
if strategy == "recursive":
|
30 |
+
self.split = RecursiveCharacterTextSplitter().split_text
|
31 |
if strategy == "sequence":
|
32 |
self.split = self.seq_splitter
|
33 |
if strategy == "constant":
|
|
|
138 |
|
139 |
|
140 |
def change_dropdown(choice):
|
141 |
+
if choice == "recursive" or choice == "sequence":
|
142 |
return [
|
143 |
gr.Textbox(visible=True),
|
144 |
gr.Textbox(visible=False)
|
|
|
166 |
chunk_private = gr.Checkbox(label="Make chunked dataset private")
|
167 |
with gr.Row():
|
168 |
dropdown = gr.Dropdown(
|
169 |
+
["recursive", "sequence", "constant"], label="Chunking strategy",
|
170 |
+
info="'recursive' uses a Langchain recursive tokenizer, 'sequence' splits texts by a chosen sequence, "
|
171 |
"'constant' makes chunks of the constant size",
|
172 |
scale=2
|
173 |
)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain==0.0.*
|
2 |
+
aiohttp==3.8.*
|
3 |
+
datasets==2.16.*
|
4 |
+
numpy==1.25.*
|