########################### # Baseline ATB Newswire Datasets # # This file creates the four data sets used in the current # line of Arabic parsing research: # # (1) Raw (no Bies mapping) / Unvocalized ("Raw") # (2) Bies + DT / Unvocalized ("Unvoc") # (3) Bies + DT / Vocalized ("Voc") # (4) Bies + DT / Unvocalized ("NoDashTags") # -No traces or phrasal tag decorations. For training the Berkeley parser. # # Note that "Bies + DT" refers to the enhancement to the Bies mappings # proposed by Kulick et al. (2006). # # The training/dev/test set is the "Mona Diab split" from the 2005 JHU # workshop on parsing Arabic dialects (Chiang et al., 2006). # # # IMPORTANT: All paths should reference the base Arabic data directory # # /u/nlp/data/Arabic # ########################### NAME=1 Raw Train TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train OUTPUT_ENCODING=UTF8 FLAT=true ;; NAME=1 Raw Dev TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev OUTPUT_ENCODING=UTF8 FLAT=true ;; NAME=1 Raw Test TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test OUTPUT_ENCODING=UTF8 FLAT=true ;; NAME=2 Unvoc All 
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp USEDET=true ;; NAME=2 Unvoc Train TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp USEDET=true ;; NAME=2 Unvoc Dev TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp 
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp USEDET=true ;; NAME=2 Unvoc Test TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp USEDET=true ;; NAME=3 Voc Train TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp USEDET=true LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper FLAT=true ;; NAME=3 Voc Dev TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev OUTPUT_ENCODING=UTF8 
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp USEDET=true LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper FLAT=true ;; NAME=3 Voc Test TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp USEDET=true LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper FLAT=true ;; NAME=4 Unvoc Train NoDashTags TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp NODASHTAGS=true ADDROOT=true USEDET=true ;; NAME=4 Unvoc Dev 
NoDashTags TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp NODASHTAGS=true ADDROOT=true USEDET=true ;; NAME=4 Unvoc Test NoDashTags TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test OUTPUT_ENCODING=UTF8 MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp NODASHTAGS=true ADDROOT=true USEDET=true ;;