Spaces:

sergeipetrov
/

chunk-embed

Sleeping

App Files Community

plaggy committed on Feb 21

Commit

8134cf8

•

1 Parent(s): 7ddbf9d

refactor, a single process

Browse files

Files changed (1) hide show

app.py +97 -102

app.py CHANGED Viewed

@@ -27,7 +27,11 @@ class Chunker:
         self.split_seq = split_seq
         self.chunk_len = chunk_len
         if strategy == "recursive":
-            self.split = RecursiveCharacterTextSplitter().split_text
         if strategy == "sequence":
             self.split = self.seq_splitter
         if strategy == "constant":
@@ -51,26 +55,6 @@ def generator(input_ds, input_text_col, chunker):
                 yield {input_text_col: chunk}
-def chunk(input_ds, input_splits, input_text_col, output_ds, strategy, split_seq, chunk_len, private):
-    input_splits = [spl.strip() for spl in input_splits.split(",") if spl]
-    input_ds = load_dataset(input_ds, split="+".join(input_splits))
-    chunker = Chunker(strategy, split_seq, chunk_len)
-    gen_kwargs = {
-        "input_ds": input_ds,
-        "input_text_col": input_text_col,
-        "chunker": chunker
-    }
-    dataset = Dataset.from_generator(generator, gen_kwargs=gen_kwargs)
-    dataset.push_to_hub(
-        output_ds,
-        private=private,
-        token=HF_TOKEN
-    )
-    logger.info("Done chunking")
 async def embed_sent(sentence, embed_in_text_col, semaphore, tei_url, tmp_file):
     async with semaphore:
         payload = {
@@ -108,6 +92,7 @@ async def embed_ds(input_ds, tei_url, embed_in_text_col, temp_file):
 def wake_up_endpoint(url):
     n_loop = 0
     while requests.get(
         url=url,
@@ -115,30 +100,61 @@ def wake_up_endpoint(url):
     ).status_code != 200:
         time.sleep(2)
         n_loop += 1
-        if n_loop > 30:
-            raise TimeoutError("TEI endpoint is unavailable")
     logger.info("TEI endpoint is up")
-def run_embed(input_ds, input_splits, embed_in_text_col, output_ds, tei_url, private):
-    wake_up_endpoint(tei_url)
-    input_splits = [spl.strip() for spl in input_splits.split(",") if spl]
-    input_ds = load_dataset(input_ds, split="+".join(input_splits))
-    with tempfile.NamedTemporaryFile(mode="a", suffix=".jsonl") as temp_file:
-        asyncio.run(embed_ds(input_ds, tei_url, embed_in_text_col, temp_file))
-        dataset = Dataset.from_json(temp_file.name)
-        dataset.push_to_hub(
-            output_ds,
-            private=private,
-            token=HF_TOKEN
-        )
     logger.info("Done embedding")
 def change_dropdown(choice):
-    if choice == "recursive" or choice == "sequence":
         return [
             gr.Textbox(visible=True),
             gr.Textbox(visible=False)
@@ -153,73 +169,52 @@ def change_dropdown(choice):
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        ## Chunk your dataset before embedding
         """
     )
-    with gr.Tab("Chunk"):
-        chunk_in_ds = gr.Textbox(lines=1, label="Input dataset name")
-        with gr.Row():
-            chunk_in_splits = gr.Textbox(lines=1, label="Input dataset splits", placeholder="train, test")
-            chunk_in_text_col = gr.Textbox(lines=1, label="Input text column name", placeholder="text")
-        with gr.Row():
-            chunk_out_ds = gr.Textbox(lines=1, label="Output dataset name", scale=6)
-            chunk_private = gr.Checkbox(label="Make chunked dataset private")
-        with gr.Row():
-            dropdown = gr.Dropdown(
-                ["recursive", "sequence", "constant"], label="Chunking strategy",
-                info="'recursive' uses a Langchain recursive tokenizer, 'sequence' splits texts by a chosen sequence, "
-                     "'constant' makes chunks of the constant size",
-                scale=2
-            )
-            split_seq = gr.Textbox(
-                lines=1,
-                interactive=True,
-                visible=False,
-                label="Sequence",
-                info="A text sequence to split on",
-                placeholder="\n\n"
-            )
-            chunk_len = gr.Textbox(
-                lines=1,
-                interactive=True,
-                visible=False,
-                label="Length",
-                info="The length of chunks to split into",
-                placeholder="512"
-            )
-            dropdown.change(fn=change_dropdown, inputs=dropdown, outputs=[split_seq, chunk_len])
-        with gr.Row():
-            gr.ClearButton(
-                components=[
-                    chunk_in_ds, chunk_in_splits, chunk_in_text_col, chunk_out_ds,
-                    dropdown, split_seq, chunk_len, chunk_private
-                ]
-            )
-            chunk_btn = gr.Button("Chunk")
-            chunk_btn.click(
-                fn=chunk,
-                inputs=[chunk_in_ds, chunk_in_splits, chunk_in_text_col, chunk_out_ds,
-                        dropdown, split_seq, chunk_len, chunk_private]
-            )
-    with gr.Tab("Embed"):
-        embed_in_ds = gr.Textbox(lines=1, label="Input dataset name")
-        with gr.Row():
-            embed_in_splits = gr.Textbox(lines=1, label="Input dataset splits", placeholder="train, test")
-            embed_in_text_col = gr.Textbox(lines=1, label="Input text column name", placeholder="text")
-        with gr.Row():
-            embed_out_ds = gr.Textbox(lines=1, label="Output dataset name", scale=6)
-            embed_private = gr.Checkbox(label="Make embedded dataset private")
-        tei_url = gr.Textbox(lines=1, label="TEI endpoint url")
-        with gr.Row():
-            gr.ClearButton(
-                components=[embed_in_ds, embed_in_splits, embed_in_text_col, embed_out_ds, tei_url, embed_private]
-            )
-            embed_btn = gr.Button("Run embed")
-            embed_btn.click(
-                fn=run_embed,
-                inputs=[embed_in_ds, embed_in_splits, embed_in_text_col, embed_out_ds, tei_url, embed_private]
-            )
 demo.launch(debug=True)

         self.split_seq = split_seq
         self.chunk_len = chunk_len
         if strategy == "recursive":
+            # https://huggingface.co/spaces/m-ric/chunk_visualizer
+            self.split = RecursiveCharacterTextSplitter(
+                chunk_size=chunk_len,
+                separators=[split_seq]
+            ).split_text
         if strategy == "sequence":
             self.split = self.seq_splitter
         if strategy == "constant":
                 yield {input_text_col: chunk}
 async def embed_sent(sentence, embed_in_text_col, semaphore, tei_url, tmp_file):
     async with semaphore:
         payload = {
 def wake_up_endpoint(url):
+    logger.info("Starting up TEI endpoint")
     n_loop = 0
     while requests.get(
         url=url,
     ).status_code != 200:
         time.sleep(2)
         n_loop += 1
+        if n_loop > 40:
+            raise gr.Error("TEI endpoint is unavailable")
     logger.info("TEI endpoint is up")
+def chunk_embed(input_ds, input_splits, input_text_col, chunk_out_ds,
+                strategy, split_seq, chunk_len, embed_out_ds, tei_url, private):
+    gr.Info("Started chunking")
+    try:
+        input_splits = [spl.strip() for spl in input_splits.split(",") if spl]
+        input_ds = load_dataset(input_ds, split="+".join(input_splits), token=HF_TOKEN)
+        chunker = Chunker(strategy, split_seq, chunk_len)
+    except Exception as e:
+        raise gr.Error(str(e))
+    gen_kwargs = {
+        "input_ds": input_ds,
+        "input_text_col": input_text_col,
+        "chunker": chunker
+    }
+    chunked_ds = Dataset.from_generator(generator, gen_kwargs=gen_kwargs)
+    chunked_ds.push_to_hub(
+        chunk_out_ds,
+        private=private,
+        token=HF_TOKEN
+    )
+    gr.Info("Done chunking")
+    logger.info("Done chunking")
+    try:
+        wake_up_endpoint(tei_url)
+        with tempfile.NamedTemporaryFile(mode="a", suffix=".jsonl") as temp_file:
+            asyncio.run(embed_ds(chunked_ds, tei_url, input_text_col, temp_file))
+            embedded_ds = Dataset.from_json(temp_file.name)
+            embedded_ds.push_to_hub(
+                embed_out_ds,
+                private=private,
+                token=HF_TOKEN
+            )
+    except Exception as e:
+        raise gr.Error(str(e))
+    gr.Info("Done embedding")
     logger.info("Done embedding")
 def change_dropdown(choice):
+    if choice == "recursive":
+        return [
+            gr.Textbox(visible=True),
+            gr.Textbox(visible=True)
+        ]
+    elif choice == "sequence":
         return [
             gr.Textbox(visible=True),
             gr.Textbox(visible=False)
 with gr.Blocks() as demo:
     gr.Markdown(
         """
+        ## Chunk and embed
         """
     )
+    input_ds = gr.Textbox(lines=1, label="Input dataset name")
+    with gr.Row():
+        input_splits = gr.Textbox(lines=1, label="Input dataset splits", placeholder="train, test")
+        input_text_col = gr.Textbox(lines=1, label="Input text column name", placeholder="text")
+    chunk_out_ds = gr.Textbox(lines=1, label="Chunked dataset name")
+    with gr.Row():
+        dropdown = gr.Dropdown(
+            ["recursive", "sequence", "constant"], label="Chunking strategy",
+            info="'recursive' uses a Langchain recursive tokenizer, 'sequence' splits texts by a chosen sequence, "
+                 "'constant' makes chunks of the constant size",
+            scale=2
+        )
+        split_seq = gr.Textbox(
+            lines=1,
+            interactive=True,
+            visible=False,
+            label="Sequence",
+            info="A text sequence to split on",
+            placeholder="\n\n"
+        )
+        chunk_len = gr.Textbox(
+            lines=1,
+            interactive=True,
+            visible=False,
+            label="Length",
+            info="The length of chunks to split into in characters",
+            placeholder="512"
+        )
+        dropdown.change(fn=change_dropdown, inputs=dropdown, outputs=[split_seq, chunk_len])
+    embed_out_ds = gr.Textbox(lines=1, label="Embedded dataset name")
+    private = gr.Checkbox(label="Make output datasets private")
+    tei_url = gr.Textbox(lines=1, label="TEI endpoint url")
+    with gr.Row():
+        clear = gr.ClearButton(
+            components=[input_ds, input_splits, input_text_col, chunk_out_ds,
+                dropdown, split_seq, chunk_len, embed_out_ds, tei_url, private]
+        )
+        embed_btn = gr.Button("Submit")
+        embed_btn.click(
+            fn=chunk_embed,
+            inputs=[input_ds, input_splits, input_text_col, chunk_out_ds,
+                dropdown, split_seq, chunk_len, embed_out_ds, tei_url, private]
+        )
+demo.queue()
 demo.launch(debug=True)