edugp committed on
Commit
7b62017
1 Parent(s): 0def03f

Run tokenizer before computing perplexity and format

Browse files
README.md CHANGED
@@ -15,13 +15,13 @@ pip install -r requirements.txt
15
  ```
16
 
17
  # Web App:
18
- The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses). To run it locally:
19
  ```
20
  python -m streamlit run app.py
21
  ```
22
 
23
  # CLI:
24
- The CLI with no arguments defaults to running mc4 in Spanish.
25
  For full usage:
26
  ```
27
  python cli.py --help
@@ -43,4 +43,4 @@ python cli.py \
43
  # Tests:
44
  ```
45
  python -m unittest discover -s ./tests/ -p "test_*.py"
46
- ```
 
15
  ```
16
 
17
  # Web App:
18
+ The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses). To run it locally:
19
  ```
20
  python -m streamlit run app.py
21
  ```
22
 
23
  # CLI:
24
+ The CLI with no arguments defaults to running mc4 in Spanish.
25
  For full usage:
26
  ```
27
  python cli.py --help
 
43
  # Tests:
44
  ```
45
  python -m unittest discover -s ./tests/ -p "test_*.py"
46
+ ```
app.py CHANGED
@@ -3,11 +3,15 @@ from functools import partial
3
 
4
  import streamlit as st
5
  from embedding_lenses.data import uploaded_file_to_dataframe
6
- from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
 
7
  from embedding_lenses.embedding import load_model
8
 
9
- from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
10
- from perplexity_lenses.engine import DIMENSIONALITY_REDUCTION_ALGORITHMS, DOCUMENT_TYPES, EMBEDDING_MODELS, LANGUAGES, SEED, generate_plot
 
 
 
11
  from perplexity_lenses.perplexity import KenlmModel
12
 
13
  logging.basicConfig(level=logging.INFO)
@@ -17,7 +21,9 @@ logger = logging.getLogger(__name__)
17
  st.title("Perplexity Lenses")
18
  st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
19
  uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
20
- st.write("Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)")
 
 
21
  col1, col2, col3 = st.columns(3)
22
  with col1:
23
  hub_dataset = st.text_input("Dataset name", "mc4")
@@ -38,13 +44,17 @@ with col6:
38
  with col7:
39
  sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
40
 
41
- dimensionality_reduction = st.selectbox("Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0)
 
 
42
  model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)
43
 
44
  with st.spinner(text="Loading embedding model..."):
45
  model = load_model(model_name)
46
  dimensionality_reduction_function = (
47
- partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction == "UMAP" else partial(get_tsne_embeddings, random_state=SEED)
 
 
48
  )
49
 
50
  with st.spinner(text="Loading KenLM model..."):
@@ -58,12 +68,32 @@ if uploaded_file or hub_dataset:
58
  df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
59
  df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
60
  else:
61
- df = hub_dataset_to_dataframe(hub_dataset, hub_dataset_config, hub_dataset_split, sample, text_column, kenlm_model, seed=SEED, doc_type=doc_type)
 
 
 
 
 
 
 
 
 
62
 
63
  # Round perplexity
64
  df["perplexity"] = df["perplexity"].round().astype(int)
65
- logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
66
- plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model, seed=SEED, context_logger=st.spinner)
 
 
 
 
 
 
 
 
 
 
 
67
  logger.info("Displaying plot")
68
  st.bokeh_chart(plot)
69
  logger.info("Done")
 
3
 
4
  import streamlit as st
5
  from embedding_lenses.data import uploaded_file_to_dataframe
6
+ from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
7
+ get_umap_embeddings)
8
  from embedding_lenses.embedding import load_model
9
 
10
+ from perplexity_lenses.data import (documents_df_to_sentences_df,
11
+ hub_dataset_to_dataframe)
12
+ from perplexity_lenses.engine import (DIMENSIONALITY_REDUCTION_ALGORITHMS,
13
+ DOCUMENT_TYPES, EMBEDDING_MODELS,
14
+ LANGUAGES, SEED, generate_plot)
15
  from perplexity_lenses.perplexity import KenlmModel
16
 
17
  logging.basicConfig(level=logging.INFO)
 
21
  st.title("Perplexity Lenses")
22
  st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
23
  uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
24
+ st.write(
25
+ "Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)"
26
+ )
27
  col1, col2, col3 = st.columns(3)
28
  with col1:
29
  hub_dataset = st.text_input("Dataset name", "mc4")
 
44
  with col7:
45
  sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
46
 
47
+ dimensionality_reduction = st.selectbox(
48
+ "Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0
49
+ )
50
  model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)
51
 
52
  with st.spinner(text="Loading embedding model..."):
53
  model = load_model(model_name)
54
  dimensionality_reduction_function = (
55
+ partial(get_umap_embeddings, random_state=SEED)
56
+ if dimensionality_reduction == "UMAP"
57
+ else partial(get_tsne_embeddings, random_state=SEED)
58
  )
59
 
60
  with st.spinner(text="Loading KenLM model..."):
 
68
  df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
69
  df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
70
  else:
71
+ df = hub_dataset_to_dataframe(
72
+ hub_dataset,
73
+ hub_dataset_config,
74
+ hub_dataset_split,
75
+ sample,
76
+ text_column,
77
+ kenlm_model,
78
+ seed=SEED,
79
+ doc_type=doc_type,
80
+ )
81
 
82
  # Round perplexity
83
  df["perplexity"] = df["perplexity"].round().astype(int)
84
+ logger.info(
85
+ f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}"
86
+ )
87
+ plot = generate_plot(
88
+ df,
89
+ text_column,
90
+ "perplexity",
91
+ None,
92
+ dimensionality_reduction_function,
93
+ model,
94
+ seed=SEED,
95
+ context_logger=st.spinner,
96
+ )
97
  logger.info("Displaying plot")
98
  st.bokeh_chart(plot)
99
  logger.info("Done")
cli.py CHANGED
@@ -2,15 +2,20 @@ import logging
2
  from functools import partial
3
  from typing import Optional
4
 
 
5
  import typer
6
  from bokeh.plotting import output_file as bokeh_output_file
7
  from bokeh.plotting import save
8
  from embedding_lenses.data import uploaded_file_to_dataframe
9
- from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
 
10
  from embedding_lenses.embedding import load_model
11
 
12
- from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
13
- from perplexity_lenses.engine import DIMENSIONALITY_REDUCTION_ALGORITHMS, DOCUMENT_TYPES, EMBEDDING_MODELS, LANGUAGES, SEED, generate_plot
 
 
 
14
  from perplexity_lenses.perplexity import KenlmModel
15
 
16
  logging.basicConfig(level=logging.INFO)
@@ -22,19 +27,36 @@ app = typer.Typer()
22
 
23
  @app.command()
24
  def main(
25
- dataset: str = typer.Option("mc4", help="The name of the hub dataset or local csv/tsv file."),
26
- dataset_config: Optional[str] = typer.Option("es", help="The configuration of the hub dataset, if any. Does not apply to local csv/tsv files."),
27
- dataset_split: Optional[str] = typer.Option("train", help="The dataset split. Does not apply to local csv/tsv files."),
 
 
 
 
 
 
 
28
  text_column: str = typer.Option("text", help="The text field name."),
29
- language: str = typer.Option("es", help=f"The language of the text. Options: {LANGUAGES}"),
30
- doc_type: str = typer.Option("sentence", help=f"Whether to embed at the sentence or document level. Options: {DOCUMENT_TYPES}."),
 
 
 
 
 
31
  sample: int = typer.Option(1000, help="Maximum number of examples to use."),
32
  dimensionality_reduction: str = typer.Option(
33
  DIMENSIONALITY_REDUCTION_ALGORITHMS[0],
34
  help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.",
35
  ),
36
- model_name: str = typer.Option(EMBEDDING_MODELS[0], help=f"The sentence embedding model to use. Options: {EMBEDDING_MODELS}"),
37
- output_file: str = typer.Option("perplexity.html", help="The name of the output visualization HTML file."),
 
 
 
 
 
38
  ):
39
  """
40
  Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values.
@@ -42,26 +64,47 @@ def main(
42
  logger.info("Loading embedding model...")
43
  model = load_model(model_name)
44
  dimensionality_reduction_function = (
45
- partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction.lower() == "umap" else partial(get_tsne_embeddings, random_state=SEED)
 
 
46
  )
47
  logger.info("Loading KenLM model...")
48
  kenlm_model = KenlmModel.from_pretrained(language)
49
  logger.info("Loading dataset...")
50
  if dataset.endswith(".csv") or dataset.endswith(".tsv"):
51
- df = uploaded_file_to_dataframe(dataset)
52
  if doc_type.lower() == "sentence":
53
  df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
54
  df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
55
  else:
56
- df = hub_dataset_to_dataframe(dataset, dataset_config, dataset_split, sample, text_column, kenlm_model, seed=SEED, doc_type=doc_type)
57
- # Round perplexity
58
- df["perplexity"] = df["perplexity"].round().astype(int)
59
- logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
60
- plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model, seed=SEED)
61
- logger.info("Saving plot")
62
- bokeh_output_file(output_file)
63
- save(plot)
64
- logger.info("Done")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  if __name__ == "__main__":
 
2
  from functools import partial
3
  from typing import Optional
4
 
5
+ import pandas as pd
6
  import typer
7
  from bokeh.plotting import output_file as bokeh_output_file
8
  from bokeh.plotting import save
9
  from embedding_lenses.data import uploaded_file_to_dataframe
10
+ from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
11
+ get_umap_embeddings)
12
  from embedding_lenses.embedding import load_model
13
 
14
+ from perplexity_lenses.data import (documents_df_to_sentences_df,
15
+ hub_dataset_to_dataframe)
16
+ from perplexity_lenses.engine import (DIMENSIONALITY_REDUCTION_ALGORITHMS,
17
+ DOCUMENT_TYPES, EMBEDDING_MODELS,
18
+ LANGUAGES, SEED, generate_plot)
19
  from perplexity_lenses.perplexity import KenlmModel
20
 
21
  logging.basicConfig(level=logging.INFO)
 
27
 
28
  @app.command()
29
  def main(
30
+ dataset: str = typer.Option(
31
+ "mc4", help="The name of the hub dataset or local csv/tsv file."
32
+ ),
33
+ dataset_config: Optional[str] = typer.Option(
34
+ "es",
35
+ help="The configuration of the hub dataset, if any. Does not apply to local csv/tsv files.",
36
+ ),
37
+ dataset_split: Optional[str] = typer.Option(
38
+ "train", help="The dataset split. Does not apply to local csv/tsv files."
39
+ ),
40
  text_column: str = typer.Option("text", help="The text field name."),
41
+ language: str = typer.Option(
42
+ "es", help=f"The language of the text. Options: {LANGUAGES}"
43
+ ),
44
+ doc_type: str = typer.Option(
45
+ "sentence",
46
+ help=f"Whether to embed at the sentence or document level. Options: {DOCUMENT_TYPES}.",
47
+ ),
48
  sample: int = typer.Option(1000, help="Maximum number of examples to use."),
49
  dimensionality_reduction: str = typer.Option(
50
  DIMENSIONALITY_REDUCTION_ALGORITHMS[0],
51
  help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.",
52
  ),
53
+ model_name: str = typer.Option(
54
+ EMBEDDING_MODELS[0],
55
+ help=f"The sentence embedding model to use. Options: {EMBEDDING_MODELS}",
56
+ ),
57
+ output_file: str = typer.Option(
58
+ "perplexity.html", help="The name of the output visualization HTML file."
59
+ ),
60
  ):
61
  """
62
  Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values.
 
64
  logger.info("Loading embedding model...")
65
  model = load_model(model_name)
66
  dimensionality_reduction_function = (
67
+ partial(get_umap_embeddings, random_state=SEED)
68
+ if dimensionality_reduction.lower() == "umap"
69
+ else partial(get_tsne_embeddings, random_state=SEED)
70
  )
71
  logger.info("Loading KenLM model...")
72
  kenlm_model = KenlmModel.from_pretrained(language)
73
  logger.info("Loading dataset...")
74
  if dataset.endswith(".csv") or dataset.endswith(".tsv"):
75
+ df = pd.read_csv(dataset, sep="\t" if dataset.endswith(".tsv") else ",")
76
  if doc_type.lower() == "sentence":
77
  df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
78
  df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
79
  else:
80
+ df = hub_dataset_to_dataframe(
81
+ dataset,
82
+ dataset_config,
83
+ dataset_split,
84
+ sample,
85
+ text_column,
86
+ kenlm_model,
87
+ seed=SEED,
88
+ doc_type=doc_type,
89
+ )
90
+ # Round perplexity
91
+ df["perplexity"] = df["perplexity"].round().astype(int)
92
+ logger.info(
93
+ f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}"
94
+ )
95
+ plot = generate_plot(
96
+ df,
97
+ text_column,
98
+ "perplexity",
99
+ None,
100
+ dimensionality_reduction_function,
101
+ model,
102
+ seed=SEED,
103
+ )
104
+ logger.info("Saving plot")
105
+ bokeh_output_file(output_file)
106
+ save(plot)
107
+ logger.info("Done")
108
 
109
 
110
  if __name__ == "__main__":
perplexity_lenses/data.py CHANGED
@@ -9,7 +9,14 @@ from perplexity_lenses.perplexity import KenlmModel
9
 
10
 
11
  def hub_dataset_to_dataframe(
12
- path: str, name: str, split: str, sample: int, text_column: str, model: KenlmModel, seed: int = 0, doc_type: str = "Whole document"
 
 
 
 
 
 
 
13
  ) -> pd.DataFrame:
14
  load_dataset_fn = partial(load_dataset, path=path)
15
  if name:
@@ -18,9 +25,19 @@ def hub_dataset_to_dataframe(
18
  load_dataset_fn = partial(load_dataset_fn, split=split)
19
  dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
20
  if doc_type.lower() == "sentence":
21
- dataset = dataset.map(lambda x: [{text_column: sentence, "perplexity": model.get_perplexity(sentence)} for sentence in x[text_column].split("\n")])
 
 
 
 
 
22
  else:
23
- dataset = dataset.map(lambda x: {text_column: x[text_column], "perplexity": model.get_perplexity(x[text_column])})
 
 
 
 
 
24
  instances = []
25
  count = 0
26
  for instance in tqdm(dataset, total=sample):
@@ -38,6 +55,14 @@ def hub_dataset_to_dataframe(
38
  return pd.DataFrame(instances)
39
 
40
 
41
- def documents_df_to_sentences_df(df: pd.DataFrame, text_column: str, sample: int, seed: int = 0):
42
- df_sentences = pd.DataFrame({text_column: np.array(df[text_column].map(lambda x: x.split("\n")).values.tolist()).flatten()})
 
 
 
 
 
 
 
 
43
  return df_sentences.sample(min(sample, df_sentences.shape[0]), random_state=seed)
 
9
 
10
 
11
  def hub_dataset_to_dataframe(
12
+ path: str,
13
+ name: str,
14
+ split: str,
15
+ sample: int,
16
+ text_column: str,
17
+ model: KenlmModel,
18
+ seed: int = 0,
19
+ doc_type: str = "Whole document",
20
  ) -> pd.DataFrame:
21
  load_dataset_fn = partial(load_dataset, path=path)
22
  if name:
 
25
  load_dataset_fn = partial(load_dataset_fn, split=split)
26
  dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
27
  if doc_type.lower() == "sentence":
28
+ dataset = dataset.map(
29
+ lambda x: [
30
+ {text_column: sentence, "perplexity": model.get_perplexity(sentence)}
31
+ for sentence in x[text_column].split("\n")
32
+ ]
33
+ )
34
  else:
35
+ dataset = dataset.map(
36
+ lambda x: {
37
+ text_column: x[text_column],
38
+ "perplexity": model.get_perplexity(x[text_column]),
39
+ }
40
+ )
41
  instances = []
42
  count = 0
43
  for instance in tqdm(dataset, total=sample):
 
55
  return pd.DataFrame(instances)
56
 
57
 
58
+ def documents_df_to_sentences_df(
59
+ df: pd.DataFrame, text_column: str, sample: int, seed: int = 0
60
+ ):
61
+ df_sentences = pd.DataFrame(
62
+ {
63
+ text_column: np.array(
64
+ df[text_column].map(lambda x: x.split("\n")).values.tolist()
65
+ ).flatten()
66
+ }
67
+ )
68
  return df_sentences.sample(min(sample, df_sentences.shape[0]), random_state=seed)
perplexity_lenses/engine.py CHANGED
@@ -96,7 +96,9 @@ def generate_plot(
96
  context_logger: Union[st.spinner, ContextLogger] = ContextLogger,
97
  ) -> Figure:
98
  if text_column not in df.columns:
99
- raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
 
 
100
  if label_column not in df.columns:
101
  df[label_column] = 0
102
  df = df.dropna(subset=[text_column, label_column])
@@ -110,6 +112,12 @@ def generate_plot(
110
  embeddings_2d = dimensionality_reduction_function(embeddings)
111
  logger.info("Generating figure")
112
  plot = draw_interactive_scatter_plot(
113
- df[text_column].values, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels.values, df[label_column].values, text_column, label_column
 
 
 
 
 
 
114
  )
115
  return plot
 
96
  context_logger: Union[st.spinner, ContextLogger] = ContextLogger,
97
  ) -> Figure:
98
  if text_column not in df.columns:
99
+ raise ValueError(
100
+ f"The specified column name doesn't exist. Columns available: {df.columns.values}"
101
+ )
102
  if label_column not in df.columns:
103
  df[label_column] = 0
104
  df = df.dropna(subset=[text_column, label_column])
 
112
  embeddings_2d = dimensionality_reduction_function(embeddings)
113
  logger.info("Generating figure")
114
  plot = draw_interactive_scatter_plot(
115
+ df[text_column].values,
116
+ embeddings_2d[:, 0],
117
+ embeddings_2d[:, 1],
118
+ encoded_labels.values,
119
+ df[label_column].values,
120
+ text_column,
121
+ label_column,
122
  )
123
  return plot
perplexity_lenses/perplexity.py CHANGED
@@ -5,6 +5,21 @@ import urllib.request
5
  from typing import Dict
6
 
7
  import kenlm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  class KenlmModel:
@@ -46,32 +61,42 @@ class KenlmModel:
46
  "►": "-",
47
  }
48
  unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")
49
- non_printing_chars_re = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]")
 
 
50
 
51
  def __init__(self, language):
52
  download_kenlm_model(language)
53
  try:
54
  self.model = kenlm.Model(f"{language}.arpa.bin")
 
55
  except OSError:
56
  os.remove(f"{language}.arpa.bin")
57
  if os.path.exists(f"{language}.sp.model"):
58
  os.remove(f"{language}.sp.model")
59
- raise OSError("File was corrupt and should have been removed. Please, retry.")
 
 
60
 
61
  @classmethod
62
  def from_pretrained(cls, language: str):
63
  return cls(language)
64
 
 
 
 
65
  def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
66
  if normalize_cc_net:
67
  doc = self.normalize(doc)
 
 
68
  doc_log_score, doc_length = 0, 0
69
  for line in doc.split("\n"):
70
  log_score = self.model.score(line)
71
  length = len(line.split()) + 1
72
  doc_log_score += log_score
73
  doc_length += length
74
- return 10.0 ** (-doc_log_score / doc_length)
75
 
76
  def normalize(
77
  self,
@@ -106,7 +131,7 @@ class KenlmModel:
106
  return "".join(output)
107
 
108
  def replace_unicode_punct(self, text: str) -> str:
109
- return "".join((self.unicode_punct.get(c, c) for c in text))
110
 
111
  def remove_unicode_punct(self, text: str) -> str:
112
  """More aggressive version of replace_unicode_punct but also faster."""
 
5
  from typing import Dict
6
 
7
  import kenlm
8
+ import sentencepiece
9
+
10
+
11
+ class SentencePiece:
12
+ def __init__(
13
+ self,
14
+ model: str,
15
+ ):
16
+ super().__init__()
17
+ self.sp = sentencepiece.SentencePieceProcessor()
18
+ self.sp.load(str(model))
19
+
20
+ def do(self, text: dict) -> dict:
21
+ tokenized = self.sp.encode_as_pieces(text)
22
+ return " ".join(tokenized)
23
 
24
 
25
  class KenlmModel:
 
61
  "►": "-",
62
  }
63
  unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")
64
+ non_printing_chars_re = re.compile(
65
+ f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
66
+ )
67
 
68
  def __init__(self, language):
69
  download_kenlm_model(language)
70
  try:
71
  self.model = kenlm.Model(f"{language}.arpa.bin")
72
+ self.tokenizer = SentencePiece(f"{language}.sp.model")
73
  except OSError:
74
  os.remove(f"{language}.arpa.bin")
75
  if os.path.exists(f"{language}.sp.model"):
76
  os.remove(f"{language}.sp.model")
77
+ raise OSError(
78
+ "File was corrupt and should have been removed. Please, retry."
79
+ )
80
 
81
  @classmethod
82
  def from_pretrained(cls, language: str):
83
  return cls(language)
84
 
85
+ def pp(self, log_score, length):
86
+ return 10.0 ** (-log_score / length)
87
+
88
  def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
89
  if normalize_cc_net:
90
  doc = self.normalize(doc)
91
+ # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
92
+ doc = self.tokenizer.do(doc)
93
  doc_log_score, doc_length = 0, 0
94
  for line in doc.split("\n"):
95
  log_score = self.model.score(line)
96
  length = len(line.split()) + 1
97
  doc_log_score += log_score
98
  doc_length += length
99
+ return round(self.pp(doc_log_score, doc_length), 1)
100
 
101
  def normalize(
102
  self,
 
131
  return "".join(output)
132
 
133
  def replace_unicode_punct(self, text: str) -> str:
134
+ return "".join(self.unicode_punct.get(c, c) for c in text)
135
 
136
  def remove_unicode_punct(self, text: str) -> str:
137
  """More aggressive version of replace_unicode_punct but also faster."""
perplexity_lenses/visualization.py CHANGED
@@ -6,7 +6,13 @@ from bokeh.transform import factor_cmap
6
 
7
 
8
  def draw_interactive_scatter_plot(
9
- texts: np.ndarray, xs: np.ndarray, ys: np.ndarray, values: np.ndarray, labels: np.ndarray, text_column: str, label_column: str
 
 
 
 
 
 
10
  ) -> Figure:
11
  # Smooth down values for coloring, by taking the entropy = log10(perplexity) and multiply it by 10000
12
  values = ((np.log10(values)) * 10000).round().astype(int)
@@ -16,17 +22,33 @@ def draw_interactive_scatter_plot(
16
  if max_value - min_value == 0:
17
  values_color = np.ones(len(values))
18
  else:
19
- values_color = ((values - min_value) / (max_value - min_value) * 255).round().astype(int)
 
 
20
  values_color_sorted = sorted(values_color)
21
 
22
  values_list = values.astype(str).tolist()
23
  values_sorted = sorted(values_list)
24
  labels_list = labels.astype(str).tolist()
25
 
26
- source = ColumnDataSource(data=dict(x=xs, y=ys, text=texts, label=values_list, original_label=labels_list))
27
- hover = HoverTool(tooltips=[(text_column, "@text{safe}"), (label_column, "@original_label")])
 
 
 
 
28
  p = figure(plot_width=800, plot_height=800, tools=[hover])
29
- p.circle("x", "y", size=10, source=source, fill_color=factor_cmap("label", palette=[Pallete[id_] for id_ in values_color_sorted], factors=values_sorted))
 
 
 
 
 
 
 
 
 
 
30
 
31
  p.axis.visible = False
32
  p.xgrid.grid_line_color = None
 
6
 
7
 
8
  def draw_interactive_scatter_plot(
9
+ texts: np.ndarray,
10
+ xs: np.ndarray,
11
+ ys: np.ndarray,
12
+ values: np.ndarray,
13
+ labels: np.ndarray,
14
+ text_column: str,
15
+ label_column: str,
16
  ) -> Figure:
17
  # Smooth down values for coloring, by taking the entropy = log10(perplexity) and multiply it by 10000
18
  values = ((np.log10(values)) * 10000).round().astype(int)
 
22
  if max_value - min_value == 0:
23
  values_color = np.ones(len(values))
24
  else:
25
+ values_color = (
26
+ ((values - min_value) / (max_value - min_value) * 255).round().astype(int)
27
+ )
28
  values_color_sorted = sorted(values_color)
29
 
30
  values_list = values.astype(str).tolist()
31
  values_sorted = sorted(values_list)
32
  labels_list = labels.astype(str).tolist()
33
 
34
+ source = ColumnDataSource(
35
+ data=dict(x=xs, y=ys, text=texts, label=values_list, original_label=labels_list)
36
+ )
37
+ hover = HoverTool(
38
+ tooltips=[(text_column, "@text{safe}"), (label_column, "@original_label")]
39
+ )
40
  p = figure(plot_width=800, plot_height=800, tools=[hover])
41
+ p.circle(
42
+ "x",
43
+ "y",
44
+ size=10,
45
+ source=source,
46
+ fill_color=factor_cmap(
47
+ "label",
48
+ palette=[Pallete[id_] for id_ in values_color_sorted],
49
+ factors=values_sorted,
50
+ ),
51
+ )
52
 
53
  p.axis.visible = False
54
  p.xgrid.grid_line_color = None
requirements.txt CHANGED
@@ -1,11 +1,11 @@
 
 
 
1
  huggingface-hub==0.0.19
 
 
2
  streamlit==1.1.0
3
  transformers==4.11.3
4
- watchdog==2.1.3
5
- sentence-transformers==2.0.0
6
- bokeh==2.2.2
7
  umap-learn==0.5.2
8
- numpy==1.20.0
9
- https://files.pythonhosted.org/packages/2f/58/e00d2495b54f4ba97ca31a11aa7e636f80183ccf9b616f7eaa5518d050bb/embedding_lenses-0.5.0-py3-none-any.whl
10
- https://github.com/kpu/kenlm/archive/master.zip
11
- typer==0.4.0
 
1
+ bokeh==2.2.2
2
+ https://files.pythonhosted.org/packages/2f/58/e00d2495b54f4ba97ca31a11aa7e636f80183ccf9b616f7eaa5518d050bb/embedding_lenses-0.5.0-py3-none-any.whl
3
+ https://github.com/kpu/kenlm/archive/master.zip
4
  huggingface-hub==0.0.19
5
+ numpy==1.20.0
6
+ sentence-transformers==2.0.0
7
  streamlit==1.1.0
8
  transformers==4.11.3
9
+ typer==0.4.0
 
 
10
  umap-learn==0.5.2
11
+ watchdog==2.1.3
 
 
 
tests/test_data.py CHANGED
@@ -10,4 +10,6 @@ class TestData(unittest.TestCase):
10
  input_df = pd.DataFrame({"text": ["foo\nbar"]})
11
  expected_output_df = pd.DataFrame({"text": ["foo", "bar"]})
12
  output_df = documents_df_to_sentences_df(input_df, "text", 100)
13
- pd.testing.assert_frame_equal(output_df, expected_output_df, check_like=True, check_exact=True)
 
 
 
10
  input_df = pd.DataFrame({"text": ["foo\nbar"]})
11
  expected_output_df = pd.DataFrame({"text": ["foo", "bar"]})
12
  output_df = documents_df_to_sentences_df(input_df, "text", 100)
13
+ pd.testing.assert_frame_equal(
14
+ output_df, expected_output_df, check_like=True, check_exact=True
15
+ )