Spaces:
Sleeping
Sleeping
########################### | |
# Baseline ATB Newswire Datasets | |
# | |
# This file creates the four data sets used in the current | |
# line of Arabic parsing research: | |
# | |
# (1) Raw (no Bies mapping) / Unvocalized ("Raw") | |
# (2) Bies + DT / Unvocalized ("Unvoc") | |
# (3) Bies + DT / Vocalized ("Voc") | |
# (4) Bies + DT / Unvocalized ("NoDashTags") | |
#     - No traces or phrasal tag decorations; used for training the Berkeley parser. | |
# | |
# Note that "Bies + DT" refers to the enhancement to the Bies mappings | |
# proposed by Kulick et al. (2006). | |
# | |
# The training/dev/test set is the "Mona Diab split" from the 2005 JHU | |
# workshop on parsing Arabic dialects (Chiang et al., 2006). | |
# | |
# | |
# IMPORTANT: All paths should reference the base Arabic data directory | |
# | |
# /u/nlp/data/Arabic | |
# | |
########################### | |
NAME=1 Raw Train | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train | |
OUTPUT_ENCODING=UTF8 | |
FLAT=true | |
;; | |
NAME=1 Raw Dev | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev | |
OUTPUT_ENCODING=UTF8 | |
FLAT=true | |
;; | |
NAME=1 Raw Test | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test | |
OUTPUT_ENCODING=UTF8 | |
FLAT=true | |
;; | |
NAME=2 Unvoc All | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
USEDET=true | |
;; | |
NAME=2 Unvoc Train | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
USEDET=true | |
;; | |
NAME=2 Unvoc Dev | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
USEDET=true | |
;; | |
NAME=2 Unvoc Test | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
USEDET=true | |
;; | |
NAME=3 Voc Train | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
USEDET=true | |
LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper | |
FLAT=true | |
;; | |
NAME=3 Voc Dev | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
USEDET=true | |
LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper | |
FLAT=true | |
;; | |
NAME=3 Voc Test | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
USEDET=true | |
LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper | |
FLAT=true | |
;; | |
NAME=4 Unvoc Train NoDashTags | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
NODASHTAGS=true | |
ADDROOT=true | |
USEDET=true | |
;; | |
NAME=4 Unvoc Dev NoDashTags | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
NODASHTAGS=true | |
ADDROOT=true | |
USEDET=true | |
;; | |
NAME=4 Unvoc Test NoDashTags | |
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset | |
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel | |
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel | |
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel | |
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test | |
OUTPUT_ENCODING=UTF8 | |
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp | |
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp | |
NODASHTAGS=true | |
ADDROOT=true | |
USEDET=true | |
;; | |