lvwerra HF staff committed on
Commit
0804d15
1 Parent(s): 0756f70

Update Space (evaluate main: 1ead4793)

Browse files
Files changed (4) hide show
  1. README.md +74 -6
  2. app.py +6 -0
  3. requirements.txt +3 -0
  4. word_count.py +64 -0
README.md CHANGED
@@ -1,12 +1,80 @@
1
  ---
2
- title: Word_count
3
- emoji: 📈
4
- colorFrom: gray
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 3.0.6
8
  app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Word Count
3
+ emoji: 🤗
4
+ colorFrom: green
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 3.0.2
8
  app_file: app.py
9
  pinned: false
10
+ tags:
11
+ - evaluate
12
+ - measurement
13
  ---
14
 
15
+ # Measurement Card for Word Count
16
+
17
+ ## Measurement Description
18
+
19
+ The `word_count` measurement returns the total number of words and the number of unique words in the input string(s), using sklearn's [`CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html).
20
+
21
+ ## How to Use
22
+
23
+ This measurement requires a list of strings as input:
24
+
25
+ ```python
26
+ >>> data = ["hello world and hello moon"]
27
+ >>> wordcount= evaluate.load("word_count")
28
+ >>> results = wordcount.compute(data=data)
29
+ ```
30
+
31
+ ### Inputs
32
+ - **data** (list of `str`): The input list of strings for which the words are counted.
33
+ - **max_vocab** (`int`): (optional) the maximum vocabulary size; it is passed to `CountVectorizer` as `max_features`, so only the `max_vocab` most frequent words are counted (useful if the dataset is very large).
34
+
35
+ ### Output Values
36
+ - **total_word_count** (`int`): the total number of words in the input string(s).
37
+ - **unique_words** (`int`): the number of unique words in the input string(s).
38
+
39
+ Output Example(s):
40
+
41
+ ```python
42
+ {'total_word_count': 5, 'unique_words': 4}
43
+ ```
44
+
45
+ ### Examples
46
+
47
+ Example for a single string
48
+
49
+ ```python
50
+ >>> data = ["hello sun and goodbye moon"]
51
+ >>> wordcount = evaluate.load("word_count")
52
+ >>> results = wordcount.compute(data=data)
53
+ >>> print(results)
54
+ {'total_word_count': 5, 'unique_words': 5}
55
+ ```
56
+
57
+ Example for multiple strings
58
+ ```python
59
+ >>> data = ["hello sun and goodbye moon", "foo bar foo bar"]
60
+ >>> wordcount = evaluate.load("word_count")
61
+ >>> results = wordcount.compute(data=data)
62
+ >>> print(results)
63
+ {'total_word_count': 9, 'unique_words': 7}
64
+ ```
65
+
66
+ Example for a dataset from 🤗 Datasets:
67
+
68
+ ```python
69
+ >>> imdb = datasets.load_dataset('imdb', split = 'train')
70
+ >>> wordcount = evaluate.load("word_count")
71
+ >>> results = wordcount.compute(data=imdb['text'])
72
+ >>> print(results)
73
+ {'total_word_count': 5678573, 'unique_words': 74849}
74
+ ```
75
+
76
+ ## Citation(s)
77
+
78
+
79
+ ## Further References
80
+ - [Sklearn `CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import evaluate
2
+ from evaluate.utils import launch_gradio_widget
3
+
4
+
5
+ module = evaluate.load("word_count", type="measurement")
6
+ launch_gradio_widget(module)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/huggingface/evaluate.git@main
2
+ datasets~=2.0
3
+ sklearn~=1.1.1
word_count.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import evaluate
16
+ import datasets
17
+ from sklearn.feature_extraction.text import CountVectorizer
18
+
19
+ _DESCRIPTION = """
20
+ Returns the total number of words, and the number of unique words in the input data.
21
+ """
22
+
23
+ _KWARGS_DESCRIPTION = """
24
+ Args:
25
+ `data`: a list of `str` for which the words are counted.
26
+ `max_vocab` (optional): the top number of words to consider (can be specified if dataset is too large)
27
+
28
+ Returns:
29
+ `total_word_count` (`int`) : the total number of words in the input string(s)
30
+ `unique_words` (`int`) : the number of unique words in the input list of strings.
31
+
32
+ Examples:
33
+ >>> data = ["hello world and hello moon"]
34
+ >>> wordcount= evaluate.load("word_count")
35
+ >>> results = wordcount.compute(data=data)
36
+ >>> print(results)
37
+ {'total_word_count': 5, 'unique_words': 4}
38
+ """
39
+ _CITATION = ""
40
+
41
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
42
+ class WordCount(evaluate.EvaluationModule):
43
+ """This measurement returns the total number of words and the number of unique words
44
+ in the input string(s)."""
45
+
46
+ def _info(self):
47
+ return evaluate.EvaluationModuleInfo(
48
+ # This is the description that will appear on the modules page.
49
+ module_type="measurement",
50
+ description=_DESCRIPTION,
51
+ citation = _CITATION,
52
+ inputs_description=_KWARGS_DESCRIPTION,
53
+ features=datasets.Features({
54
+ 'data': datasets.Value('string'),
55
+ })
56
+ )
57
+
58
+ def _compute(self, data, max_vocab = None):
59
+ """Returns the total number of words and the number of unique words in the input data"""
60
+ count_vectorizer = CountVectorizer(max_features=max_vocab)
61
+ document_matrix = count_vectorizer.fit_transform(data)
62
+ word_count = document_matrix.sum()
63
+ unique_words = document_matrix.shape[1]
64
+ return {"total_word_count": word_count, "unique_words": unique_words}