File size: 5,227 Bytes
1dc581a
999a2cb
1dc581a
999a2cb
1dc581a
8ed11d6
0b6e959
 
68135f3
0b6e959
 
fc2149d
0b6e959
1dc581a
85c3256
8ed11d6
1dc581a
a695cd7
8ed11d6
a695cd7
4fd99d4
85c3256
a695cd7
1dc581a
a695cd7
999a2cb
 
 
 
 
 
 
 
 
8154f86
999a2cb
1dc581a
3d48f09
 
999a2cb
8154f86
8ed11d6
 
 
3d48f09
1dc581a
999a2cb
 
 
 
a695cd7
999a2cb
 
 
4fd99d4
999a2cb
3e1e93b
999a2cb
 
 
 
 
4fd99d4
8ed11d6
4fd99d4
999a2cb
a695cd7
999a2cb
c09c624
999a2cb
cbccbc9
 
 
 
c09c624
 
cbccbc9
75c03f0
 
 
999a2cb
4fd99d4
999a2cb
 
4fd99d4
999a2cb
 
a695cd7
4fd99d4
999a2cb
1dc581a
999a2cb
3d48f09
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import random
from gliner import GLiNER
import gradio as gr
from datasets import load_dataset

# Load the "train" split of the BL books subset, then iterate it in a
# shuffled order (fixed seed, so the order is the same on every start-up).
_bl_books = load_dataset(
    "max-long/bl_books_textile_filter",
    split="train",
    trust_remote_code=True,
)
dataset_iter = iter(_bl_books.shuffle(seed=42))

# Fine-tuned GLiNER checkpoint used for every prediction in this app
model = GLiNER.from_pretrained(
    "max-long/textile_machines_ner_5_oct",
    trust_remote_code=True,
)

def ner(text: str):
    """Find "Textile Machinery" mentions in *text* with the module-level GLiNER model.

    Args:
        text: Free text to analyse. Empty, whitespace-only or ``None`` input
            is accepted and yields an empty result without calling the model.

    Returns:
        A 2-tuple ``(highlighted, entities)``:

        * ``highlighted`` -- ``{"text": <text>, "entities": [...]}`` where each
          entry has ``start``, ``end`` and ``entity`` keys (the dictionary
          format handed to ``gr.HighlightedText``).
        * ``entities`` -- list of dicts with ``entity``, ``word``, ``start``,
          ``end`` and ``score`` keys, shown in the JSON output.
    """
    label = "Textile Machinery"
    threshold = 0.5

    # Nothing to tag: return the same output shape without running the model.
    if not text or not text.strip():
        return {"text": text or "", "entities": []}, []

    # Predict entities using the fine-tuned GLiNER model
    predictions = model.predict_entities(
        text, [label], flat_ner=True, threshold=threshold
    )

    # Keep only the requested label and rename keys for display
    textile_entities = [
        {
            "entity": ent["label"],
            "word": ent["text"],
            "start": ent["start"],
            "end": ent["end"],
            "score": ent.get("score", 0),
        }
        for ent in predictions
        if ent["label"] == label
    ]

    # Character spans for the colour-coded display
    highlights = [
        {"start": ent["start"], "end": ent["end"], "entity": ent["entity"]}
        for ent in textile_entities
    ]

    # Two outputs: one for the highlighted text, one for the JSON view
    return {"text": text, "entities": highlights}, textile_entities

with gr.Blocks(title="Textile Machinery NER Demo") as demo:
    gr.Markdown(
        """
        # Textile Machinery Entity Recognition Demo
        This demo selects a random text snippet from a subset of the British Library's books dataset and identifies "Textile Machinery" entities using a fine-tuned GLiNER model.
        """
    )

    # Input box, pre-filled with a fixed example passage. The passage is split
    # into two adjacent literals joined by an explicit "\n": a raw line break
    # inside a single-quoted string is a syntax error.
    input_text = gr.Textbox(
        value=(
            "The  machine  is  fed  by  means  of  an  endless  apron,  the wool  entering  at  the  smaller  end,  so  that  when  most  entangled it  is  subjected  to  the  least  motion.  This  apron  is  a  great improvement  on  former  machines,  which  were  filled  by  hand, an  operation  attended  with  danger,  and  sometimes  resulting  in accidents.  By  the  revolutions  of  the  cylinder,  the  wool  is  torn, disentangled,  and  cleaned,  and  by  the  gradually  increasing centrifugal  force,  it  is  impelled  forwards  towards  the  large end  of  the  cone,  encountering  in  its  way  increased  motion ; which,  however,  it  is  better  able  to  bear  by  becoming  less  and less  entangled  at  every  revolution. When  the  wool  thus  reaches  the  base  of  the  cone,  it  is tossed  into  a  chamber,  where  it  is  received  upon  another  end- less apron,  moving  in  a  direction  from  the  machine  instead  of towards  it.  Over  this  apron  is  a  cylindrical  wire  cage,  which revolves  on  an  axis  disposed  parallel  to  the  apron,  and  im- mediately over  it  is  a  revolving  fan.  Both  these  are  covered and  protected  by  sheet  iron  casings,  but  communicate  with the  chamber  which  receives  the  wool  from  the  cone.  When the  whole  is  at  work,  the  fan,  drawing  the  dust  out  of  the chamber,  blows  it  through  a  chimney,  or  pipe,  connected  with the  machine  for  the  purpose.  The  cage  prevents  the  escape of  the  wool  with  the  dust,  and,  by  its  passage  over  the  apron, it  lays  down  the  wool  in  a  continuous  fleece. The  coarser  wools,  destined  for  common  cloths,  are  willied more  than  once ;  for  instance,  before  and  after  dyeing,  and after  oiling,  and  before  they  are  scribbled ;  the  finer  wools  do not,  however,  require  this,  as  the  operation  of  scribbling  is  a sufficient  preparation  for  carding. \n"
            "In  the  West  of  England,  the  wool  is  beaten  with  wooden rollers,  by  women,  after  which  it  is  placed  in  a  wire  screen, or  hurdle,  and  pulled  with  the  hands,  so  as  to  get  rid  of  any burs  or  pitch,  or  other  dirt  which  may  not  have  been  separ- ated by  the  willy."
        ),
        label="Text input",
        placeholder="Enter your text here",
        lines=5
    )

    # Define output components
    output_highlighted = gr.HighlightedText(label="Predicted Entities")
    output_entities = gr.JSON(label="Entities")

    submit_btn = gr.Button("Find Textile Machinery!")
    refresh_btn = gr.Button("Get New Snippet")

    def get_new_snippet():
        """Return the next non-blank snippet from the shuffled dataset.

        Advances the shared module-level ``dataset_iter``, so every call
        consumes at least one row (unless the iterator is already exhausted).
        Rows whose ``text`` is missing, not a string, or blank are skipped,
        up to ``max_attempts`` rows per call.

        Returns:
            The snippet text, or ``"No more snippets available."`` when the
            iterator is exhausted or no usable row was found within the limit.
        """
        max_attempts = 1000  # Bound the number of rows examined per click
        for _ in range(max_attempts):
            sample = next(dataset_iter, None)
            if sample is None:
                break  # Ran out of snippets
            snippet = sample.get("text")
            if isinstance(snippet, str) and snippet.strip():
                return snippet
        return "No more snippets available."

    # Connect refresh button: replace the textbox content with a new snippet
    refresh_btn.click(fn=get_new_snippet, outputs=input_text)

    # Connect submit button: run NER and fill both output components
    submit_btn.click(
        fn=ner,
        inputs=[input_text],
        outputs=[output_highlighted, output_entities]
    )

# Enable request queuing, then start the server in debug mode with a share link
demo.queue().launch(debug=True, share=True)