Spaces:
Runtime error
Runtime error
Run tokenizer before computing perplexity and format
Browse files
- README.md +3 -3
- app.py +39 -9
- cli.py +64 -21
- perplexity_lenses/data.py +30 -5
- perplexity_lenses/engine.py +10 -2
- perplexity_lenses/perplexity.py +29 -4
- perplexity_lenses/visualization.py +27 -5
- requirements.txt +7 -7
- tests/test_data.py +3 -1
README.md
CHANGED
@@ -15,13 +15,13 @@ pip install -r requirements.txt
|
|
15 |
```
|
16 |
|
17 |
# Web App:
|
18 |
-
The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses). To run it locally:
|
19 |
```
|
20 |
python -m streamlit run app.py
|
21 |
```
|
22 |
|
23 |
# CLI:
|
24 |
-
The CLI with no arguments defaults to running mc4 in Spanish.
|
25 |
For full usage:
|
26 |
```
|
27 |
python cli.py --help
|
@@ -43,4 +43,4 @@ python cli.py \
|
|
43 |
# Tests:
|
44 |
```
|
45 |
python -m unittest discover -s ./tests/ -p "test_*.py"
|
46 |
-
```
|
|
|
15 |
```
|
16 |
|
17 |
# Web App:
|
18 |
+
The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses). To run it locally:
|
19 |
```
|
20 |
python -m streamlit run app.py
|
21 |
```
|
22 |
|
23 |
# CLI:
|
24 |
+
The CLI with no arguments defaults to running mc4 in Spanish.
|
25 |
For full usage:
|
26 |
```
|
27 |
python cli.py --help
|
|
|
43 |
# Tests:
|
44 |
```
|
45 |
python -m unittest discover -s ./tests/ -p "test_*.py"
|
46 |
+
```
|
app.py
CHANGED
@@ -3,11 +3,15 @@ from functools import partial
|
|
3 |
|
4 |
import streamlit as st
|
5 |
from embedding_lenses.data import uploaded_file_to_dataframe
|
6 |
-
from embedding_lenses.dimensionality_reduction import get_tsne_embeddings,
|
|
|
7 |
from embedding_lenses.embedding import load_model
|
8 |
|
9 |
-
from perplexity_lenses.data import documents_df_to_sentences_df,
|
10 |
-
|
|
|
|
|
|
|
11 |
from perplexity_lenses.perplexity import KenlmModel
|
12 |
|
13 |
logging.basicConfig(level=logging.INFO)
|
@@ -17,7 +21,9 @@ logger = logging.getLogger(__name__)
|
|
17 |
st.title("Perplexity Lenses")
|
18 |
st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
|
19 |
uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
|
20 |
-
st.write(
|
|
|
|
|
21 |
col1, col2, col3 = st.columns(3)
|
22 |
with col1:
|
23 |
hub_dataset = st.text_input("Dataset name", "mc4")
|
@@ -38,13 +44,17 @@ with col6:
|
|
38 |
with col7:
|
39 |
sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
|
40 |
|
41 |
-
dimensionality_reduction = st.selectbox(
|
|
|
|
|
42 |
model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)
|
43 |
|
44 |
with st.spinner(text="Loading embedding model..."):
|
45 |
model = load_model(model_name)
|
46 |
dimensionality_reduction_function = (
|
47 |
-
partial(get_umap_embeddings, random_state=SEED)
|
|
|
|
|
48 |
)
|
49 |
|
50 |
with st.spinner(text="Loading KenLM model..."):
|
@@ -58,12 +68,32 @@ if uploaded_file or hub_dataset:
|
|
58 |
df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
|
59 |
df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
|
60 |
else:
|
61 |
-
df = hub_dataset_to_dataframe(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
# Round perplexity
|
64 |
df["perplexity"] = df["perplexity"].round().astype(int)
|
65 |
-
logger.info(
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
logger.info("Displaying plot")
|
68 |
st.bokeh_chart(plot)
|
69 |
logger.info("Done")
|
|
|
3 |
|
4 |
import streamlit as st
|
5 |
from embedding_lenses.data import uploaded_file_to_dataframe
|
6 |
+
from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
|
7 |
+
get_umap_embeddings)
|
8 |
from embedding_lenses.embedding import load_model
|
9 |
|
10 |
+
from perplexity_lenses.data import (documents_df_to_sentences_df,
|
11 |
+
hub_dataset_to_dataframe)
|
12 |
+
from perplexity_lenses.engine import (DIMENSIONALITY_REDUCTION_ALGORITHMS,
|
13 |
+
DOCUMENT_TYPES, EMBEDDING_MODELS,
|
14 |
+
LANGUAGES, SEED, generate_plot)
|
15 |
from perplexity_lenses.perplexity import KenlmModel
|
16 |
|
17 |
logging.basicConfig(level=logging.INFO)
|
|
|
21 |
st.title("Perplexity Lenses")
|
22 |
st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
|
23 |
uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
|
24 |
+
st.write(
|
25 |
+
"Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)"
|
26 |
+
)
|
27 |
col1, col2, col3 = st.columns(3)
|
28 |
with col1:
|
29 |
hub_dataset = st.text_input("Dataset name", "mc4")
|
|
|
44 |
with col7:
|
45 |
sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
|
46 |
|
47 |
+
dimensionality_reduction = st.selectbox(
|
48 |
+
"Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0
|
49 |
+
)
|
50 |
model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)
|
51 |
|
52 |
with st.spinner(text="Loading embedding model..."):
|
53 |
model = load_model(model_name)
|
54 |
dimensionality_reduction_function = (
|
55 |
+
partial(get_umap_embeddings, random_state=SEED)
|
56 |
+
if dimensionality_reduction == "UMAP"
|
57 |
+
else partial(get_tsne_embeddings, random_state=SEED)
|
58 |
)
|
59 |
|
60 |
with st.spinner(text="Loading KenLM model..."):
|
|
|
68 |
df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
|
69 |
df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
|
70 |
else:
|
71 |
+
df = hub_dataset_to_dataframe(
|
72 |
+
hub_dataset,
|
73 |
+
hub_dataset_config,
|
74 |
+
hub_dataset_split,
|
75 |
+
sample,
|
76 |
+
text_column,
|
77 |
+
kenlm_model,
|
78 |
+
seed=SEED,
|
79 |
+
doc_type=doc_type,
|
80 |
+
)
|
81 |
|
82 |
# Round perplexity
|
83 |
df["perplexity"] = df["perplexity"].round().astype(int)
|
84 |
+
logger.info(
|
85 |
+
f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}"
|
86 |
+
)
|
87 |
+
plot = generate_plot(
|
88 |
+
df,
|
89 |
+
text_column,
|
90 |
+
"perplexity",
|
91 |
+
None,
|
92 |
+
dimensionality_reduction_function,
|
93 |
+
model,
|
94 |
+
seed=SEED,
|
95 |
+
context_logger=st.spinner,
|
96 |
+
)
|
97 |
logger.info("Displaying plot")
|
98 |
st.bokeh_chart(plot)
|
99 |
logger.info("Done")
|
cli.py
CHANGED
@@ -2,15 +2,20 @@ import logging
|
|
2 |
from functools import partial
|
3 |
from typing import Optional
|
4 |
|
|
|
5 |
import typer
|
6 |
from bokeh.plotting import output_file as bokeh_output_file
|
7 |
from bokeh.plotting import save
|
8 |
from embedding_lenses.data import uploaded_file_to_dataframe
|
9 |
-
from embedding_lenses.dimensionality_reduction import get_tsne_embeddings,
|
|
|
10 |
from embedding_lenses.embedding import load_model
|
11 |
|
12 |
-
from perplexity_lenses.data import documents_df_to_sentences_df,
|
13 |
-
|
|
|
|
|
|
|
14 |
from perplexity_lenses.perplexity import KenlmModel
|
15 |
|
16 |
logging.basicConfig(level=logging.INFO)
|
@@ -22,19 +27,36 @@ app = typer.Typer()
|
|
22 |
|
23 |
@app.command()
|
24 |
def main(
|
25 |
-
dataset: str = typer.Option(
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
text_column: str = typer.Option("text", help="The text field name."),
|
29 |
-
language: str = typer.Option(
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
31 |
sample: int = typer.Option(1000, help="Maximum number of examples to use."),
|
32 |
dimensionality_reduction: str = typer.Option(
|
33 |
DIMENSIONALITY_REDUCTION_ALGORITHMS[0],
|
34 |
help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.",
|
35 |
),
|
36 |
-
model_name: str = typer.Option(
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
38 |
):
|
39 |
"""
|
40 |
Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values.
|
@@ -42,26 +64,47 @@ def main(
|
|
42 |
logger.info("Loading embedding model...")
|
43 |
model = load_model(model_name)
|
44 |
dimensionality_reduction_function = (
|
45 |
-
partial(get_umap_embeddings, random_state=SEED)
|
|
|
|
|
46 |
)
|
47 |
logger.info("Loading KenLM model...")
|
48 |
kenlm_model = KenlmModel.from_pretrained(language)
|
49 |
logger.info("Loading dataset...")
|
50 |
if dataset.endswith(".csv") or dataset.endswith(".tsv"):
|
51 |
-
df =
|
52 |
if doc_type.lower() == "sentence":
|
53 |
df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
|
54 |
df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
|
55 |
else:
|
56 |
-
df = hub_dataset_to_dataframe(
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
|
67 |
if __name__ == "__main__":
|
|
|
2 |
from functools import partial
|
3 |
from typing import Optional
|
4 |
|
5 |
+
import pandas as pd
|
6 |
import typer
|
7 |
from bokeh.plotting import output_file as bokeh_output_file
|
8 |
from bokeh.plotting import save
|
9 |
from embedding_lenses.data import uploaded_file_to_dataframe
|
10 |
+
from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
|
11 |
+
get_umap_embeddings)
|
12 |
from embedding_lenses.embedding import load_model
|
13 |
|
14 |
+
from perplexity_lenses.data import (documents_df_to_sentences_df,
|
15 |
+
hub_dataset_to_dataframe)
|
16 |
+
from perplexity_lenses.engine import (DIMENSIONALITY_REDUCTION_ALGORITHMS,
|
17 |
+
DOCUMENT_TYPES, EMBEDDING_MODELS,
|
18 |
+
LANGUAGES, SEED, generate_plot)
|
19 |
from perplexity_lenses.perplexity import KenlmModel
|
20 |
|
21 |
logging.basicConfig(level=logging.INFO)
|
|
|
27 |
|
28 |
@app.command()
|
29 |
def main(
|
30 |
+
dataset: str = typer.Option(
|
31 |
+
"mc4", help="The name of the hub dataset or local csv/tsv file."
|
32 |
+
),
|
33 |
+
dataset_config: Optional[str] = typer.Option(
|
34 |
+
"es",
|
35 |
+
help="The configuration of the hub dataset, if any. Does not apply to local csv/tsv files.",
|
36 |
+
),
|
37 |
+
dataset_split: Optional[str] = typer.Option(
|
38 |
+
"train", help="The dataset split. Does not apply to local csv/tsv files."
|
39 |
+
),
|
40 |
text_column: str = typer.Option("text", help="The text field name."),
|
41 |
+
language: str = typer.Option(
|
42 |
+
"es", help=f"The language of the text. Options: {LANGUAGES}"
|
43 |
+
),
|
44 |
+
doc_type: str = typer.Option(
|
45 |
+
"sentence",
|
46 |
+
help=f"Whether to embed at the sentence or document level. Options: {DOCUMENT_TYPES}.",
|
47 |
+
),
|
48 |
sample: int = typer.Option(1000, help="Maximum number of examples to use."),
|
49 |
dimensionality_reduction: str = typer.Option(
|
50 |
DIMENSIONALITY_REDUCTION_ALGORITHMS[0],
|
51 |
help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.",
|
52 |
),
|
53 |
+
model_name: str = typer.Option(
|
54 |
+
EMBEDDING_MODELS[0],
|
55 |
+
help=f"The sentence embedding model to use. Options: {EMBEDDING_MODELS}",
|
56 |
+
),
|
57 |
+
output_file: str = typer.Option(
|
58 |
+
"perplexity.html", help="The name of the output visualization HTML file."
|
59 |
+
),
|
60 |
):
|
61 |
"""
|
62 |
Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values.
|
|
|
64 |
logger.info("Loading embedding model...")
|
65 |
model = load_model(model_name)
|
66 |
dimensionality_reduction_function = (
|
67 |
+
partial(get_umap_embeddings, random_state=SEED)
|
68 |
+
if dimensionality_reduction.lower() == "umap"
|
69 |
+
else partial(get_tsne_embeddings, random_state=SEED)
|
70 |
)
|
71 |
logger.info("Loading KenLM model...")
|
72 |
kenlm_model = KenlmModel.from_pretrained(language)
|
73 |
logger.info("Loading dataset...")
|
74 |
if dataset.endswith(".csv") or dataset.endswith(".tsv"):
|
75 |
+
df = pd.read_csv(dataset, sep="\t" if dataset.endswith(".tsv") else ",")
|
76 |
if doc_type.lower() == "sentence":
|
77 |
df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
|
78 |
df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
|
79 |
else:
|
80 |
+
df = hub_dataset_to_dataframe(
|
81 |
+
dataset,
|
82 |
+
dataset_config,
|
83 |
+
dataset_split,
|
84 |
+
sample,
|
85 |
+
text_column,
|
86 |
+
kenlm_model,
|
87 |
+
seed=SEED,
|
88 |
+
doc_type=doc_type,
|
89 |
+
)
|
90 |
+
# Round perplexity
|
91 |
+
df["perplexity"] = df["perplexity"].round().astype(int)
|
92 |
+
logger.info(
|
93 |
+
f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}"
|
94 |
+
)
|
95 |
+
plot = generate_plot(
|
96 |
+
df,
|
97 |
+
text_column,
|
98 |
+
"perplexity",
|
99 |
+
None,
|
100 |
+
dimensionality_reduction_function,
|
101 |
+
model,
|
102 |
+
seed=SEED,
|
103 |
+
)
|
104 |
+
logger.info("Saving plot")
|
105 |
+
bokeh_output_file(output_file)
|
106 |
+
save(plot)
|
107 |
+
logger.info("Done")
|
108 |
|
109 |
|
110 |
if __name__ == "__main__":
|
perplexity_lenses/data.py
CHANGED
@@ -9,7 +9,14 @@ from perplexity_lenses.perplexity import KenlmModel
|
|
9 |
|
10 |
|
11 |
def hub_dataset_to_dataframe(
|
12 |
-
path: str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
) -> pd.DataFrame:
|
14 |
load_dataset_fn = partial(load_dataset, path=path)
|
15 |
if name:
|
@@ -18,9 +25,19 @@ def hub_dataset_to_dataframe(
|
|
18 |
load_dataset_fn = partial(load_dataset_fn, split=split)
|
19 |
dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
|
20 |
if doc_type.lower() == "sentence":
|
21 |
-
dataset = dataset.map(
|
|
|
|
|
|
|
|
|
|
|
22 |
else:
|
23 |
-
dataset = dataset.map(
|
|
|
|
|
|
|
|
|
|
|
24 |
instances = []
|
25 |
count = 0
|
26 |
for instance in tqdm(dataset, total=sample):
|
@@ -38,6 +55,14 @@ def hub_dataset_to_dataframe(
|
|
38 |
return pd.DataFrame(instances)
|
39 |
|
40 |
|
41 |
-
def documents_df_to_sentences_df(
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
return df_sentences.sample(min(sample, df_sentences.shape[0]), random_state=seed)
|
|
|
9 |
|
10 |
|
11 |
def hub_dataset_to_dataframe(
|
12 |
+
path: str,
|
13 |
+
name: str,
|
14 |
+
split: str,
|
15 |
+
sample: int,
|
16 |
+
text_column: str,
|
17 |
+
model: KenlmModel,
|
18 |
+
seed: int = 0,
|
19 |
+
doc_type: str = "Whole document",
|
20 |
) -> pd.DataFrame:
|
21 |
load_dataset_fn = partial(load_dataset, path=path)
|
22 |
if name:
|
|
|
25 |
load_dataset_fn = partial(load_dataset_fn, split=split)
|
26 |
dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
|
27 |
if doc_type.lower() == "sentence":
|
28 |
+
dataset = dataset.map(
|
29 |
+
lambda x: [
|
30 |
+
{text_column: sentence, "perplexity": model.get_perplexity(sentence)}
|
31 |
+
for sentence in x[text_column].split("\n")
|
32 |
+
]
|
33 |
+
)
|
34 |
else:
|
35 |
+
dataset = dataset.map(
|
36 |
+
lambda x: {
|
37 |
+
text_column: x[text_column],
|
38 |
+
"perplexity": model.get_perplexity(x[text_column]),
|
39 |
+
}
|
40 |
+
)
|
41 |
instances = []
|
42 |
count = 0
|
43 |
for instance in tqdm(dataset, total=sample):
|
|
|
55 |
return pd.DataFrame(instances)
|
56 |
|
57 |
|
58 |
+
def documents_df_to_sentences_df(
|
59 |
+
df: pd.DataFrame, text_column: str, sample: int, seed: int = 0
|
60 |
+
):
|
61 |
+
df_sentences = pd.DataFrame(
|
62 |
+
{
|
63 |
+
text_column: np.array(
|
64 |
+
df[text_column].map(lambda x: x.split("\n")).values.tolist()
|
65 |
+
).flatten()
|
66 |
+
}
|
67 |
+
)
|
68 |
return df_sentences.sample(min(sample, df_sentences.shape[0]), random_state=seed)
|
perplexity_lenses/engine.py
CHANGED
@@ -96,7 +96,9 @@ def generate_plot(
|
|
96 |
context_logger: Union[st.spinner, ContextLogger] = ContextLogger,
|
97 |
) -> Figure:
|
98 |
if text_column not in df.columns:
|
99 |
-
raise ValueError(
|
|
|
|
|
100 |
if label_column not in df.columns:
|
101 |
df[label_column] = 0
|
102 |
df = df.dropna(subset=[text_column, label_column])
|
@@ -110,6 +112,12 @@ def generate_plot(
|
|
110 |
embeddings_2d = dimensionality_reduction_function(embeddings)
|
111 |
logger.info("Generating figure")
|
112 |
plot = draw_interactive_scatter_plot(
|
113 |
-
df[text_column].values,
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
)
|
115 |
return plot
|
|
|
96 |
context_logger: Union[st.spinner, ContextLogger] = ContextLogger,
|
97 |
) -> Figure:
|
98 |
if text_column not in df.columns:
|
99 |
+
raise ValueError(
|
100 |
+
f"The specified column name doesn't exist. Columns available: {df.columns.values}"
|
101 |
+
)
|
102 |
if label_column not in df.columns:
|
103 |
df[label_column] = 0
|
104 |
df = df.dropna(subset=[text_column, label_column])
|
|
|
112 |
embeddings_2d = dimensionality_reduction_function(embeddings)
|
113 |
logger.info("Generating figure")
|
114 |
plot = draw_interactive_scatter_plot(
|
115 |
+
df[text_column].values,
|
116 |
+
embeddings_2d[:, 0],
|
117 |
+
embeddings_2d[:, 1],
|
118 |
+
encoded_labels.values,
|
119 |
+
df[label_column].values,
|
120 |
+
text_column,
|
121 |
+
label_column,
|
122 |
)
|
123 |
return plot
|
perplexity_lenses/perplexity.py
CHANGED
@@ -5,6 +5,21 @@ import urllib.request
|
|
5 |
from typing import Dict
|
6 |
|
7 |
import kenlm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
|
10 |
class KenlmModel:
|
@@ -46,32 +61,42 @@ class KenlmModel:
|
|
46 |
"►": "-",
|
47 |
}
|
48 |
unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")
|
49 |
-
non_printing_chars_re = re.compile(
|
|
|
|
|
50 |
|
51 |
def __init__(self, language):
|
52 |
download_kenlm_model(language)
|
53 |
try:
|
54 |
self.model = kenlm.Model(f"{language}.arpa.bin")
|
|
|
55 |
except OSError:
|
56 |
os.remove(f"{language}.arpa.bin")
|
57 |
if os.path.exists(f"{language}.sp.model"):
|
58 |
os.remove(f"{language}.sp.model")
|
59 |
-
raise OSError(
|
|
|
|
|
60 |
|
61 |
@classmethod
|
62 |
def from_pretrained(cls, language: str):
|
63 |
return cls(language)
|
64 |
|
|
|
|
|
|
|
65 |
def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
|
66 |
if normalize_cc_net:
|
67 |
doc = self.normalize(doc)
|
|
|
|
|
68 |
doc_log_score, doc_length = 0, 0
|
69 |
for line in doc.split("\n"):
|
70 |
log_score = self.model.score(line)
|
71 |
length = len(line.split()) + 1
|
72 |
doc_log_score += log_score
|
73 |
doc_length += length
|
74 |
-
return
|
75 |
|
76 |
def normalize(
|
77 |
self,
|
@@ -106,7 +131,7 @@ class KenlmModel:
|
|
106 |
return "".join(output)
|
107 |
|
108 |
def replace_unicode_punct(self, text: str) -> str:
|
109 |
-
return "".join(
|
110 |
|
111 |
def remove_unicode_punct(self, text: str) -> str:
|
112 |
"""More aggressive version of replace_unicode_punct but also faster."""
|
|
|
5 |
from typing import Dict
|
6 |
|
7 |
import kenlm
|
8 |
+
import sentencepiece
|
9 |
+
|
10 |
+
|
11 |
+
class SentencePiece:
|
12 |
+
def __init__(
|
13 |
+
self,
|
14 |
+
model: str,
|
15 |
+
):
|
16 |
+
super().__init__()
|
17 |
+
self.sp = sentencepiece.SentencePieceProcessor()
|
18 |
+
self.sp.load(str(model))
|
19 |
+
|
20 |
+
def do(self, text: dict) -> dict:
|
21 |
+
tokenized = self.sp.encode_as_pieces(text)
|
22 |
+
return " ".join(tokenized)
|
23 |
|
24 |
|
25 |
class KenlmModel:
|
|
|
61 |
"►": "-",
|
62 |
}
|
63 |
unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")
|
64 |
+
non_printing_chars_re = re.compile(
|
65 |
+
f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
|
66 |
+
)
|
67 |
|
68 |
def __init__(self, language):
|
69 |
download_kenlm_model(language)
|
70 |
try:
|
71 |
self.model = kenlm.Model(f"{language}.arpa.bin")
|
72 |
+
self.tokenizer = SentencePiece(f"{language}.sp.model")
|
73 |
except OSError:
|
74 |
os.remove(f"{language}.arpa.bin")
|
75 |
if os.path.exists(f"{language}.sp.model"):
|
76 |
os.remove(f"{language}.sp.model")
|
77 |
+
raise OSError(
|
78 |
+
"File was corrupt and should have been removed. Please, retry."
|
79 |
+
)
|
80 |
|
81 |
@classmethod
|
82 |
def from_pretrained(cls, language: str):
|
83 |
return cls(language)
|
84 |
|
85 |
+
def pp(self, log_score, length):
|
86 |
+
return 10.0 ** (-log_score / length)
|
87 |
+
|
88 |
def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
|
89 |
if normalize_cc_net:
|
90 |
doc = self.normalize(doc)
|
91 |
+
# Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
|
92 |
+
doc = self.tokenizer.do(doc)
|
93 |
doc_log_score, doc_length = 0, 0
|
94 |
for line in doc.split("\n"):
|
95 |
log_score = self.model.score(line)
|
96 |
length = len(line.split()) + 1
|
97 |
doc_log_score += log_score
|
98 |
doc_length += length
|
99 |
+
return round(self.pp(doc_log_score, doc_length), 1)
|
100 |
|
101 |
def normalize(
|
102 |
self,
|
|
|
131 |
return "".join(output)
|
132 |
|
133 |
def replace_unicode_punct(self, text: str) -> str:
|
134 |
+
return "".join(self.unicode_punct.get(c, c) for c in text)
|
135 |
|
136 |
def remove_unicode_punct(self, text: str) -> str:
|
137 |
"""More aggressive version of replace_unicode_punct but also faster."""
|
perplexity_lenses/visualization.py
CHANGED
@@ -6,7 +6,13 @@ from bokeh.transform import factor_cmap
|
|
6 |
|
7 |
|
8 |
def draw_interactive_scatter_plot(
|
9 |
-
texts: np.ndarray,
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
) -> Figure:
|
11 |
# Smooth down values for coloring, by taking the entropy = log10(perplexity) and multiply it by 10000
|
12 |
values = ((np.log10(values)) * 10000).round().astype(int)
|
@@ -16,17 +22,33 @@ def draw_interactive_scatter_plot(
|
|
16 |
if max_value - min_value == 0:
|
17 |
values_color = np.ones(len(values))
|
18 |
else:
|
19 |
-
values_color = (
|
|
|
|
|
20 |
values_color_sorted = sorted(values_color)
|
21 |
|
22 |
values_list = values.astype(str).tolist()
|
23 |
values_sorted = sorted(values_list)
|
24 |
labels_list = labels.astype(str).tolist()
|
25 |
|
26 |
-
source = ColumnDataSource(
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
p = figure(plot_width=800, plot_height=800, tools=[hover])
|
29 |
-
p.circle(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
p.axis.visible = False
|
32 |
p.xgrid.grid_line_color = None
|
|
|
6 |
|
7 |
|
8 |
def draw_interactive_scatter_plot(
|
9 |
+
texts: np.ndarray,
|
10 |
+
xs: np.ndarray,
|
11 |
+
ys: np.ndarray,
|
12 |
+
values: np.ndarray,
|
13 |
+
labels: np.ndarray,
|
14 |
+
text_column: str,
|
15 |
+
label_column: str,
|
16 |
) -> Figure:
|
17 |
# Smooth down values for coloring, by taking the entropy = log10(perplexity) and multiply it by 10000
|
18 |
values = ((np.log10(values)) * 10000).round().astype(int)
|
|
|
22 |
if max_value - min_value == 0:
|
23 |
values_color = np.ones(len(values))
|
24 |
else:
|
25 |
+
values_color = (
|
26 |
+
((values - min_value) / (max_value - min_value) * 255).round().astype(int)
|
27 |
+
)
|
28 |
values_color_sorted = sorted(values_color)
|
29 |
|
30 |
values_list = values.astype(str).tolist()
|
31 |
values_sorted = sorted(values_list)
|
32 |
labels_list = labels.astype(str).tolist()
|
33 |
|
34 |
+
source = ColumnDataSource(
|
35 |
+
data=dict(x=xs, y=ys, text=texts, label=values_list, original_label=labels_list)
|
36 |
+
)
|
37 |
+
hover = HoverTool(
|
38 |
+
tooltips=[(text_column, "@text{safe}"), (label_column, "@original_label")]
|
39 |
+
)
|
40 |
p = figure(plot_width=800, plot_height=800, tools=[hover])
|
41 |
+
p.circle(
|
42 |
+
"x",
|
43 |
+
"y",
|
44 |
+
size=10,
|
45 |
+
source=source,
|
46 |
+
fill_color=factor_cmap(
|
47 |
+
"label",
|
48 |
+
palette=[Pallete[id_] for id_ in values_color_sorted],
|
49 |
+
factors=values_sorted,
|
50 |
+
),
|
51 |
+
)
|
52 |
|
53 |
p.axis.visible = False
|
54 |
p.xgrid.grid_line_color = None
|
requirements.txt
CHANGED
@@ -1,11 +1,11 @@
|
|
|
|
|
|
|
|
1 |
huggingface-hub==0.0.19
|
|
|
|
|
2 |
streamlit==1.1.0
|
3 |
transformers==4.11.3
|
4 |
-
|
5 |
-
sentence-transformers==2.0.0
|
6 |
-
bokeh==2.2.2
|
7 |
umap-learn==0.5.2
|
8 |
-
|
9 |
-
https://files.pythonhosted.org/packages/2f/58/e00d2495b54f4ba97ca31a11aa7e636f80183ccf9b616f7eaa5518d050bb/embedding_lenses-0.5.0-py3-none-any.whl
|
10 |
-
https://github.com/kpu/kenlm/archive/master.zip
|
11 |
-
typer==0.4.0
|
|
|
1 |
+
bokeh==2.2.2
|
2 |
+
https://files.pythonhosted.org/packages/2f/58/e00d2495b54f4ba97ca31a11aa7e636f80183ccf9b616f7eaa5518d050bb/embedding_lenses-0.5.0-py3-none-any.whl
|
3 |
+
https://github.com/kpu/kenlm/archive/master.zip
|
4 |
huggingface-hub==0.0.19
|
5 |
+
numpy==1.20.0
|
6 |
+
sentence-transformers==2.0.0
|
7 |
streamlit==1.1.0
|
8 |
transformers==4.11.3
|
9 |
+
typer==0.4.0
|
|
|
|
|
10 |
umap-learn==0.5.2
|
11 |
+
watchdog==2.1.3
|
|
|
|
|
|
tests/test_data.py
CHANGED
@@ -10,4 +10,6 @@ class TestData(unittest.TestCase):
|
|
10 |
input_df = pd.DataFrame({"text": ["foo\nbar"]})
|
11 |
expected_output_df = pd.DataFrame({"text": ["foo", "bar"]})
|
12 |
output_df = documents_df_to_sentences_df(input_df, "text", 100)
|
13 |
-
pd.testing.assert_frame_equal(
|
|
|
|
|
|
10 |
input_df = pd.DataFrame({"text": ["foo\nbar"]})
|
11 |
expected_output_df = pd.DataFrame({"text": ["foo", "bar"]})
|
12 |
output_df = documents_df_to_sentences_df(input_df, "text", 100)
|
13 |
+
pd.testing.assert_frame_equal(
|
14 |
+
output_df, expected_output_df, check_like=True, check_exact=True
|
15 |
+
)
|