Tokenizer

Browse files

Files changed (6) hide show

added_tokens.json +237 -0
merges.txt +0 -0
special_tokens_map.json +15 -0
tokenizer.json +0 -0
tokenizer_config.json +65 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,237 @@

+{
+  "+/-": 50403,
+  "2021": 50328,
+  "2030": 50417,
+  "2050": 50487,
+  "CH4": 50354,
+  "CO2": 50265,
+  "Committee": 50410,
+  "GHG": 50397,
+  "N2O": 50382,
+  "achieve": 50437,
+  "across": 50277,
+  "activities": 50317,
+  "adaptation": 50377,
+  "addition": 50303,
+  "additional": 50444,
+  "affect": 50438,
+  "agreement": 50464,
+  "agricultural": 50395,
+  "already": 50435,
+  "analyses": 50485,
+  "annual": 50287,
+  "applied": 50360,
+  "approach": 50290,
+  "areas": 50275,
+  "assess": 50430,
+  "assessment": 50345,
+  "atmosphere": 50341,
+  "atmospheric": 50310,
+  "basis": 50372,
+  "become": 50394,
+  "benefits": 50402,
+  "biomass": 50314,
+  "capture": 50379,
+  "caused": 50389,
+  "challenges": 50404,
+  "characteristics": 50420,
+  "climatic": 50319,
+  "coastal": 50393,
+  "combined": 50451,
+  "communities": 50318,
+  "companies": 50312,
+  "compared": 50282,
+  "composition": 50418,
+  "concentration": 50357,
+  "concentrations": 50366,
+  "conditions": 50272,
+  "conducted": 50439,
+  "conservation": 50458,
+  "considered": 50339,
+  "consistent": 50494,
+  "construction": 50445,
+  "consumption": 50307,
+  "contribute": 50498,
+  "contribution": 50468,
+  "costs": 50326,
+  "countries": 50283,
+  "crisis": 50499,
+  "customers": 50358,
+  "decades": 50495,
+  "decision": 50452,
+  "decrease": 50367,
+  "decreased": 50396,
+  "degrees": 50276,
+  "delta": 50433,
+  "determine": 50480,
+  "determined": 50455,
+  "developing": 50412,
+  "differences": 50375,
+  "dioxide": 50346,
+  "distribution": 50296,
+  "diversity": 50431,
+  "drought": 50336,
+  "dynamics": 50390,
+  "ecological": 50401,
+  "economy": 50338,
+  "ecosystem": 50333,
+  "ecosystems": 50384,
+  "efforts": 50392,
+  "electricity": 50315,
+  "emission": 50279,
+  "emissions": 50266,
+  "employees": 50419,
+  "ensure": 50449,
+  "environmental": 50269,
+  "estimate": 50459,
+  "estimated": 50362,
+  "estimates": 50408,
+  "factors": 50292,
+  "findings": 50492,
+  "flood": 50363,
+  "flux": 50476,
+  "fluxes": 50491,
+  "following": 50369,
+  "forests": 50406,
+  "fossil": 50348,
+  "fuels": 50424,
+  "further": 50301,
+  "gases": 50471,
+  "greater": 50368,
+  "greenhouse": 50289,
+  "however": 50454,
+  "hydrogen": 50376,
+  "identify": 50460,
+  "impacts": 50281,
+  "importance": 50414,
+  "included": 50434,
+  "increase": 50271,
+  "increased": 50274,
+  "increases": 50361,
+  "indicate": 50388,
+  "industry": 50306,
+  "influence": 50329,
+  "infrastructure": 50425,
+  "integrated": 50483,
+  "investigated": 50385,
+  "investment": 50321,
+  "investments": 50466,
+  "least": 50429,
+  "losses": 50462,
+  "mainly": 50399,
+  "materials": 50398,
+  "means": 50486,
+  "measured": 50364,
+  "measurements": 50422,
+  "methane": 50359,
+  "methods": 50378,
+  "mitigation": 50457,
+  "moisture": 50493,
+  "monitoring": 50465,
+  "nitrogen": 50405,
+  "northern": 50446,
+  "observations": 50461,
+  "observed": 50293,
+  "obtained": 50391,
+  "ocean": 50347,
+  "operating": 50440,
+  "operations": 50374,
+  "opportunities": 50469,
+  "overall": 50426,
+  "pandemic": 50490,
+  "parameters": 50383,
+  "particular": 50413,
+  "patterns": 50349,
+  "performed": 50497,
+  "periods": 50432,
+  "planning": 50473,
+  "plans": 50423,
+  "plants": 50313,
+  "policies": 50355,
+  "pollution": 50467,
+  "populations": 50441,
+  "possible": 50332,
+  "potential": 50273,
+  "practices": 50453,
+  "precipitation": 50280,
+  "presented": 50428,
+  "previous": 50482,
+  "processes": 50291,
+  "productivity": 50463,
+  "proposed": 50370,
+  "provide": 50285,
+  "provides": 50373,
+  "rainfall": 50323,
+  "ratio": 50416,
+  "recovery": 50450,
+  "reduce": 50288,
+  "reduced": 50327,
+  "reducing": 50381,
+  "reduction": 50286,
+  "regional": 50308,
+  "regions": 50302,
+  "relationship": 50400,
+  "relatively": 50484,
+  "renewable": 50294,
+  "requirements": 50477,
+  "respectively": 50316,
+  "responses": 50427,
+  "resulting": 50456,
+  "risks": 50309,
+  "scenario": 50421,
+  "scenarios": 50334,
+  "seasonal": 50411,
+  "sediment": 50475,
+  "several": 50342,
+  "shares": 50474,
+  "showed": 50304,
+  "significantly": 50299,
+  "simulations": 50470,
+  "snow": 50496,
+  "soil": 50270,
+  "soils": 50448,
+  "solar": 50320,
+  "solutions": 50351,
+  "sources": 50331,
+  "southern": 50481,
+  "spatial": 50322,
+  "statements": 50472,
+  "strategies": 50387,
+  "strategy": 50353,
+  "structure": 50337,
+  "studied": 50443,
+  "studies": 50297,
+  "summer": 50335,
+  "supply": 50311,
+  "sustainability": 50325,
+  "sustainable": 50284,
+  "systems": 50278,
+  "targets": 50436,
+  "technologies": 50343,
+  "temperature": 50268,
+  "temperatures": 50295,
+  "temporal": 50479,
+  "thermal": 50365,
+  "towards": 50409,
+  "transition": 50344,
+  "transport": 50352,
+  "trees": 50478,
+  "trend": 50407,
+  "trends": 50371,
+  "tropical": 50415,
+  "uncertainty": 50489,
+  "understanding": 50356,
+  "variability": 50298,
+  "variables": 50447,
+  "variation": 50380,
+  "variations": 50442,
+  "various": 50340,
+  "vegetation": 50330,
+  "waste": 50350,
+  "yield": 50386,
+  "–": 50300,
+  "’": 50267,
+  "“": 50305,
+  "”": 50488,
+  "•": 50324
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "name_or_path": "language_model/model/ClimateBERT_21072022_acc_grad_roberta",
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "special_tokens_map_file": "pre_model/21072022_roberta/special_tokens_map.json",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff