victormiller committed on
Commit
11009b9
1 Parent(s): c6e3afb

Update common.py

Browse files
Files changed (1) hide show
  1. common.py +7 -2
common.py CHANGED
@@ -37,7 +37,12 @@ dask.bag.from_sequence(doc_file_paths)
37
  .map_partitions(make_doc_pairs)
38
  .compute()
39
  """
40
- email_code = "r&quot[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[A-Za-z0-9](?:[" r"A-Za-z0-9-]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[&quot r&quot01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[A-Za-z0-9-]*[A-Za-z0-9]:)])"
 
 
 
 
 
41
 
42
  global_div = Div(
43
  Section(
@@ -98,7 +103,7 @@ global_div = Div(
98
  H3("Removing PII"),
99
  P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
100
  P("We have used the following regular expressions to identify and replace PII:"),
101
- Ul(Li("Email: NEED TO UPDATE"),Li("IP Address: NEED TO UPDATE")),
102
  ),
103
  Section(
104
  H2("Normalization Form C (NFC)"),
 
37
  .map_partitions(make_doc_pairs)
38
  .compute()
39
  """
40
+ email_code = """
41
+ r"[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[A-Za-z0-9]
42
+ (?:[A-Za-z0-9-]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?|\[(?:(?:25
43
+ [0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
44
+ |[A-Za-z0-9-]*[A-Za-z0-9]:)])
45
+ """
46
 
47
  global_div = Div(
48
  Section(
 
103
  H3("Removing PII"),
104
  P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
105
  P("We have used the following regular expressions to identify and replace PII:"),
106
+ Ul(Li("Email:"), Li(email_code, style="list-style-type: none"), Li("IP Address: NEED TO UPDATE")),
107
  ),
108
  Section(
109
  H2("Normalization Form C (NFC)"),