mohdelgaar commited on
Commit
b028d48
1 Parent(s): 674b430

upload lng

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. lng/L2SCA/LICENSE.txt +339 -0
  2. lng/L2SCA/Makefile +12 -0
  3. lng/L2SCA/README-L2SCA.txt +94 -0
  4. lng/L2SCA/README-gui.txt +206 -0
  5. lng/L2SCA/README-tregex.txt +429 -0
  6. lng/L2SCA/README-tsurgeon.txt +529 -0
  7. lng/L2SCA/Semgrex.ppt +0 -0
  8. lng/L2SCA/analyzeFolder.py +148 -0
  9. lng/L2SCA/analyzeText.py +146 -0
  10. lng/L2SCA/examples/atree +1 -0
  11. lng/L2SCA/examples/exciseNP +6 -0
  12. lng/L2SCA/examples/relabelWithGroupName +4 -0
  13. lng/L2SCA/examples/renameVerb +3 -0
  14. lng/L2SCA/lib/ABOUT-AppleJavaExtensions.txt +29 -0
  15. lng/L2SCA/lib/AppleJavaExtensions.jar +3 -0
  16. lng/L2SCA/lib/README-AppleJavaExtensions.txt +46 -0
  17. lng/L2SCA/run-tregex-gui.bat +1 -0
  18. lng/L2SCA/run-tregex-gui.command +2 -0
  19. lng/L2SCA/samples/my_sample.txt +1 -0
  20. lng/L2SCA/samples/sample1.txt +10 -0
  21. lng/L2SCA/samples/sample1_output +2 -0
  22. lng/L2SCA/samples/sample2.txt +1 -0
  23. lng/L2SCA/samples/samples_output +3 -0
  24. lng/L2SCA/stanford-parser-full-2014-01-04/LICENSE.txt +339 -0
  25. lng/L2SCA/stanford-parser-full-2014-01-04/Makefile +13 -0
  26. lng/L2SCA/stanford-parser-full-2014-01-04/ParserDemo.java +100 -0
  27. lng/L2SCA/stanford-parser-full-2014-01-04/ParserDemo2.java +88 -0
  28. lng/L2SCA/stanford-parser-full-2014-01-04/README.txt +280 -0
  29. lng/L2SCA/stanford-parser-full-2014-01-04/README_dependencies.txt +194 -0
  30. lng/L2SCA/stanford-parser-full-2014-01-04/StanfordDependenciesManual.pdf +0 -0
  31. lng/L2SCA/stanford-parser-full-2014-01-04/bin/makeSerialized.csh +242 -0
  32. lng/L2SCA/stanford-parser-full-2014-01-04/bin/run-tb-preproc +65 -0
  33. lng/L2SCA/stanford-parser-full-2014-01-04/build.xml +190 -0
  34. lng/L2SCA/stanford-parser-full-2014-01-04/conf/atb-latest.conf +209 -0
  35. lng/L2SCA/stanford-parser-full-2014-01-04/conf/ftb-latest.conf +44 -0
  36. lng/L2SCA/stanford-parser-full-2014-01-04/data/arabic-onesent-utf8.txt +1 -0
  37. lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-gb18030.txt +1 -0
  38. lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-unseg-gb18030.txt +1 -0
  39. lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-unseg-utf8.txt +1 -0
  40. lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-utf8.txt +1 -0
  41. lng/L2SCA/stanford-parser-full-2014-01-04/data/english-onesent.txt +1 -0
  42. lng/L2SCA/stanford-parser-full-2014-01-04/data/french-onesent.txt +1 -0
  43. lng/L2SCA/stanford-parser-full-2014-01-04/data/german-onesent.txt +1 -0
  44. lng/L2SCA/stanford-parser-full-2014-01-04/data/pos-sentences.txt +7 -0
  45. lng/L2SCA/stanford-parser-full-2014-01-04/data/testsent.txt +10 -0
  46. lng/L2SCA/stanford-parser-full-2014-01-04/ejml-0.23.jar +3 -0
  47. lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.bat +3 -0
  48. lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.command +13 -0
  49. lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.sh +13 -0
  50. lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-lang-train-test.sh +50 -0
lng/L2SCA/LICENSE.txt ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 2, June 1991
3
+
4
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6
+ Everyone is permitted to copy and distribute verbatim copies
7
+ of this license document, but changing it is not allowed.
8
+
9
+ Preamble
10
+
11
+ The licenses for most software are designed to take away your
12
+ freedom to share and change it. By contrast, the GNU General Public
13
+ License is intended to guarantee your freedom to share and change free
14
+ software--to make sure the software is free for all its users. This
15
+ General Public License applies to most of the Free Software
16
+ Foundation's software and to any other program whose authors commit to
17
+ using it. (Some other Free Software Foundation software is covered by
18
+ the GNU Lesser General Public License instead.) You can apply it to
19
+ your programs, too.
20
+
21
+ When we speak of free software, we are referring to freedom, not
22
+ price. Our General Public Licenses are designed to make sure that you
23
+ have the freedom to distribute copies of free software (and charge for
24
+ this service if you wish), that you receive source code or can get it
25
+ if you want it, that you can change the software or use pieces of it
26
+ in new free programs; and that you know you can do these things.
27
+
28
+ To protect your rights, we need to make restrictions that forbid
29
+ anyone to deny you these rights or to ask you to surrender the rights.
30
+ These restrictions translate to certain responsibilities for you if you
31
+ distribute copies of the software, or if you modify it.
32
+
33
+ For example, if you distribute copies of such a program, whether
34
+ gratis or for a fee, you must give the recipients all the rights that
35
+ you have. You must make sure that they, too, receive or can get the
36
+ source code. And you must show them these terms so they know their
37
+ rights.
38
+
39
+ We protect your rights with two steps: (1) copyright the software, and
40
+ (2) offer you this license which gives you legal permission to copy,
41
+ distribute and/or modify the software.
42
+
43
+ Also, for each author's protection and ours, we want to make certain
44
+ that everyone understands that there is no warranty for this free
45
+ software. If the software is modified by someone else and passed on, we
46
+ want its recipients to know that what they have is not the original, so
47
+ that any problems introduced by others will not reflect on the original
48
+ authors' reputations.
49
+
50
+ Finally, any free program is threatened constantly by software
51
+ patents. We wish to avoid the danger that redistributors of a free
52
+ program will individually obtain patent licenses, in effect making the
53
+ program proprietary. To prevent this, we have made it clear that any
54
+ patent must be licensed for everyone's free use or not licensed at all.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ GNU GENERAL PUBLIC LICENSE
60
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61
+
62
+ 0. This License applies to any program or other work which contains
63
+ a notice placed by the copyright holder saying it may be distributed
64
+ under the terms of this General Public License. The "Program", below,
65
+ refers to any such program or work, and a "work based on the Program"
66
+ means either the Program or any derivative work under copyright law:
67
+ that is to say, a work containing the Program or a portion of it,
68
+ either verbatim or with modifications and/or translated into another
69
+ language. (Hereinafter, translation is included without limitation in
70
+ the term "modification".) Each licensee is addressed as "you".
71
+
72
+ Activities other than copying, distribution and modification are not
73
+ covered by this License; they are outside its scope. The act of
74
+ running the Program is not restricted, and the output from the Program
75
+ is covered only if its contents constitute a work based on the
76
+ Program (independent of having been made by running the Program).
77
+ Whether that is true depends on what the Program does.
78
+
79
+ 1. You may copy and distribute verbatim copies of the Program's
80
+ source code as you receive it, in any medium, provided that you
81
+ conspicuously and appropriately publish on each copy an appropriate
82
+ copyright notice and disclaimer of warranty; keep intact all the
83
+ notices that refer to this License and to the absence of any warranty;
84
+ and give any other recipients of the Program a copy of this License
85
+ along with the Program.
86
+
87
+ You may charge a fee for the physical act of transferring a copy, and
88
+ you may at your option offer warranty protection in exchange for a fee.
89
+
90
+ 2. You may modify your copy or copies of the Program or any portion
91
+ of it, thus forming a work based on the Program, and copy and
92
+ distribute such modifications or work under the terms of Section 1
93
+ above, provided that you also meet all of these conditions:
94
+
95
+ a) You must cause the modified files to carry prominent notices
96
+ stating that you changed the files and the date of any change.
97
+
98
+ b) You must cause any work that you distribute or publish, that in
99
+ whole or in part contains or is derived from the Program or any
100
+ part thereof, to be licensed as a whole at no charge to all third
101
+ parties under the terms of this License.
102
+
103
+ c) If the modified program normally reads commands interactively
104
+ when run, you must cause it, when started running for such
105
+ interactive use in the most ordinary way, to print or display an
106
+ announcement including an appropriate copyright notice and a
107
+ notice that there is no warranty (or else, saying that you provide
108
+ a warranty) and that users may redistribute the program under
109
+ these conditions, and telling the user how to view a copy of this
110
+ License. (Exception: if the Program itself is interactive but
111
+ does not normally print such an announcement, your work based on
112
+ the Program is not required to print an announcement.)
113
+
114
+ These requirements apply to the modified work as a whole. If
115
+ identifiable sections of that work are not derived from the Program,
116
+ and can be reasonably considered independent and separate works in
117
+ themselves, then this License, and its terms, do not apply to those
118
+ sections when you distribute them as separate works. But when you
119
+ distribute the same sections as part of a whole which is a work based
120
+ on the Program, the distribution of the whole must be on the terms of
121
+ this License, whose permissions for other licensees extend to the
122
+ entire whole, and thus to each and every part regardless of who wrote it.
123
+
124
+ Thus, it is not the intent of this section to claim rights or contest
125
+ your rights to work written entirely by you; rather, the intent is to
126
+ exercise the right to control the distribution of derivative or
127
+ collective works based on the Program.
128
+
129
+ In addition, mere aggregation of another work not based on the Program
130
+ with the Program (or with a work based on the Program) on a volume of
131
+ a storage or distribution medium does not bring the other work under
132
+ the scope of this License.
133
+
134
+ 3. You may copy and distribute the Program (or a work based on it,
135
+ under Section 2) in object code or executable form under the terms of
136
+ Sections 1 and 2 above provided that you also do one of the following:
137
+
138
+ a) Accompany it with the complete corresponding machine-readable
139
+ source code, which must be distributed under the terms of Sections
140
+ 1 and 2 above on a medium customarily used for software interchange; or,
141
+
142
+ b) Accompany it with a written offer, valid for at least three
143
+ years, to give any third party, for a charge no more than your
144
+ cost of physically performing source distribution, a complete
145
+ machine-readable copy of the corresponding source code, to be
146
+ distributed under the terms of Sections 1 and 2 above on a medium
147
+ customarily used for software interchange; or,
148
+
149
+ c) Accompany it with the information you received as to the offer
150
+ to distribute corresponding source code. (This alternative is
151
+ allowed only for noncommercial distribution and only if you
152
+ received the program in object code or executable form with such
153
+ an offer, in accord with Subsection b above.)
154
+
155
+ The source code for a work means the preferred form of the work for
156
+ making modifications to it. For an executable work, complete source
157
+ code means all the source code for all modules it contains, plus any
158
+ associated interface definition files, plus the scripts used to
159
+ control compilation and installation of the executable. However, as a
160
+ special exception, the source code distributed need not include
161
+ anything that is normally distributed (in either source or binary
162
+ form) with the major components (compiler, kernel, and so on) of the
163
+ operating system on which the executable runs, unless that component
164
+ itself accompanies the executable.
165
+
166
+ If distribution of executable or object code is made by offering
167
+ access to copy from a designated place, then offering equivalent
168
+ access to copy the source code from the same place counts as
169
+ distribution of the source code, even though third parties are not
170
+ compelled to copy the source along with the object code.
171
+
172
+ 4. You may not copy, modify, sublicense, or distribute the Program
173
+ except as expressly provided under this License. Any attempt
174
+ otherwise to copy, modify, sublicense or distribute the Program is
175
+ void, and will automatically terminate your rights under this License.
176
+ However, parties who have received copies, or rights, from you under
177
+ this License will not have their licenses terminated so long as such
178
+ parties remain in full compliance.
179
+
180
+ 5. You are not required to accept this License, since you have not
181
+ signed it. However, nothing else grants you permission to modify or
182
+ distribute the Program or its derivative works. These actions are
183
+ prohibited by law if you do not accept this License. Therefore, by
184
+ modifying or distributing the Program (or any work based on the
185
+ Program), you indicate your acceptance of this License to do so, and
186
+ all its terms and conditions for copying, distributing or modifying
187
+ the Program or works based on it.
188
+
189
+ 6. Each time you redistribute the Program (or any work based on the
190
+ Program), the recipient automatically receives a license from the
191
+ original licensor to copy, distribute or modify the Program subject to
192
+ these terms and conditions. You may not impose any further
193
+ restrictions on the recipients' exercise of the rights granted herein.
194
+ You are not responsible for enforcing compliance by third parties to
195
+ this License.
196
+
197
+ 7. If, as a consequence of a court judgment or allegation of patent
198
+ infringement or for any other reason (not limited to patent issues),
199
+ conditions are imposed on you (whether by court order, agreement or
200
+ otherwise) that contradict the conditions of this License, they do not
201
+ excuse you from the conditions of this License. If you cannot
202
+ distribute so as to satisfy simultaneously your obligations under this
203
+ License and any other pertinent obligations, then as a consequence you
204
+ may not distribute the Program at all. For example, if a patent
205
+ license would not permit royalty-free redistribution of the Program by
206
+ all those who receive copies directly or indirectly through you, then
207
+ the only way you could satisfy both it and this License would be to
208
+ refrain entirely from distribution of the Program.
209
+
210
+ If any portion of this section is held invalid or unenforceable under
211
+ any particular circumstance, the balance of the section is intended to
212
+ apply and the section as a whole is intended to apply in other
213
+ circumstances.
214
+
215
+ It is not the purpose of this section to induce you to infringe any
216
+ patents or other property right claims or to contest validity of any
217
+ such claims; this section has the sole purpose of protecting the
218
+ integrity of the free software distribution system, which is
219
+ implemented by public license practices. Many people have made
220
+ generous contributions to the wide range of software distributed
221
+ through that system in reliance on consistent application of that
222
+ system; it is up to the author/donor to decide if he or she is willing
223
+ to distribute software through any other system and a licensee cannot
224
+ impose that choice.
225
+
226
+ This section is intended to make thoroughly clear what is believed to
227
+ be a consequence of the rest of this License.
228
+
229
+ 8. If the distribution and/or use of the Program is restricted in
230
+ certain countries either by patents or by copyrighted interfaces, the
231
+ original copyright holder who places the Program under this License
232
+ may add an explicit geographical distribution limitation excluding
233
+ those countries, so that distribution is permitted only in or among
234
+ countries not thus excluded. In such case, this License incorporates
235
+ the limitation as if written in the body of this License.
236
+
237
+ 9. The Free Software Foundation may publish revised and/or new versions
238
+ of the General Public License from time to time. Such new versions will
239
+ be similar in spirit to the present version, but may differ in detail to
240
+ address new problems or concerns.
241
+
242
+ Each version is given a distinguishing version number. If the Program
243
+ specifies a version number of this License which applies to it and "any
244
+ later version", you have the option of following the terms and conditions
245
+ either of that version or of any later version published by the Free
246
+ Software Foundation. If the Program does not specify a version number of
247
+ this License, you may choose any version ever published by the Free Software
248
+ Foundation.
249
+
250
+ 10. If you wish to incorporate parts of the Program into other free
251
+ programs whose distribution conditions are different, write to the author
252
+ to ask for permission. For software which is copyrighted by the Free
253
+ Software Foundation, write to the Free Software Foundation; we sometimes
254
+ make exceptions for this. Our decision will be guided by the two goals
255
+ of preserving the free status of all derivatives of our free software and
256
+ of promoting the sharing and reuse of software generally.
257
+
258
+ NO WARRANTY
259
+
260
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261
+ FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262
+ OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263
+ PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264
+ OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266
+ TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267
+ PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268
+ REPAIR OR CORRECTION.
269
+
270
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272
+ REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273
+ INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274
+ OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275
+ TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276
+ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277
+ PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278
+ POSSIBILITY OF SUCH DAMAGES.
279
+
280
+ END OF TERMS AND CONDITIONS
281
+
282
+ How to Apply These Terms to Your New Programs
283
+
284
+ If you develop a new program, and you want it to be of the greatest
285
+ possible use to the public, the best way to achieve this is to make it
286
+ free software which everyone can redistribute and change under these terms.
287
+
288
+ To do so, attach the following notices to the program. It is safest
289
+ to attach them to the start of each source file to most effectively
290
+ convey the exclusion of warranty; and each file should have at least
291
+ the "copyright" line and a pointer to where the full notice is found.
292
+
293
+ <one line to give the program's name and a brief idea of what it does.>
294
+ Copyright (C) <year> <name of author>
295
+
296
+ This program is free software; you can redistribute it and/or modify
297
+ it under the terms of the GNU General Public License as published by
298
+ the Free Software Foundation; either version 2 of the License, or
299
+ (at your option) any later version.
300
+
301
+ This program is distributed in the hope that it will be useful,
302
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
303
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304
+ GNU General Public License for more details.
305
+
306
+ You should have received a copy of the GNU General Public License along
307
+ with this program; if not, write to the Free Software Foundation, Inc.,
308
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309
+
310
+ Also add information on how to contact you by electronic and paper mail.
311
+
312
+ If the program is interactive, make it output a short notice like this
313
+ when it starts in an interactive mode:
314
+
315
+ Gnomovision version 69, Copyright (C) year name of author
316
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317
+ This is free software, and you are welcome to redistribute it
318
+ under certain conditions; type `show c' for details.
319
+
320
+ The hypothetical commands `show w' and `show c' should show the appropriate
321
+ parts of the General Public License. Of course, the commands you use may
322
+ be called something other than `show w' and `show c'; they could even be
323
+ mouse-clicks or menu items--whatever suits your program.
324
+
325
+ You should also get your employer (if you work as a programmer) or your
326
+ school, if any, to sign a "copyright disclaimer" for the program, if
327
+ necessary. Here is a sample; alter the names:
328
+
329
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
331
+
332
+ <signature of Ty Coon>, 1 April 1989
333
+ Ty Coon, President of Vice
334
+
335
+ This General Public License does not permit incorporating your program into
336
+ proprietary programs. If your program is a subroutine library, you may
337
+ consider it more useful to permit linking proprietary applications with the
338
+ library. If this is what you want to do, use the GNU Lesser General
339
+ Public License instead of this License.
lng/L2SCA/Makefile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is a rudimentary Makefile for rebuilding the tregex distribution.
2
+ # We actually use ant (q.v.) or a Java IDE.
3
+
4
+ JAVAC = javac
5
+ JAVAFLAGS = -O -d classes -encoding utf-8
6
+
7
+ tregex:
8
+ mkdir -p classes
9
+ $(JAVAC) -classpath CLASSPATH:lib/AppleJavaExtensions.jar $(JAVAFLAGS) src/edu/stanford/nlp/*/*.java src/edu/stanford/nlp/*/*/*.java src/edu/stanford/nlp/*/*/*/*.java
10
+ cd classes ; jar -cfm ../stanford-tregex-`date +%Y-%m-%d`.jar ../src/edu/stanford/nlp/trees/tregex/gui/tregex-manifest.txt edu ; cd ..
11
+ cp stanford-tregex-`date +%Y-%m-%d`.jar stanford-tregex.jar
12
+
lng/L2SCA/README-L2SCA.txt ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This code is the L2 Syntactic Complexity Analyzer described in:
2
+
3
+ Lu, Xiaofei (2010). Automatic analysis of syntactic complexity in second language writing. International Journal of Corpus Linguistics, 15(4):474-496.
4
+
5
+ Version 3.3.3, released June 30, 2016
6
+
7
+ Copyright (C) 2016 Xiaofei Lu
8
+
9
+ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
+
13
+ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14
+
15
+ To download the latest version of this software, follow the appropriate link at
16
+ http://www.personal.psu.edu/xxl13/download.html
17
+
18
+ ==============================================================================
19
+ ABOUT
20
+
21
+ L2 Syntactic Complexity Analyzer is designed to automate syntactic complexity analysis of written English language samples produced by advanced learners of English using fourteen different measures proposed in the second language development literature. The analyzer takes a written English language sample in plain text format as input and generates 14 indices of syntactic complexity of the sample.
22
+
23
+ The analyzer is implemented in python and runs on UNIX-like (LINUX, MAC OS, or UNIX) systems with Java 1.5 and python 2.5 or higher installed. The analyzer takes as input a plain text file, counts the frequency of the following 9 structures in the text: words (W), sentences (S), verb phrases (VP), clauses (C), T-units (T), dependent clauses (DC), complex T-units (CT), coordinate phrases (CP), and complex nominals (CN), and computes the following 14 syntactic complexity indices of the text: mean length of sentence (MLS), mean length of T-unit (MLT), mean length of clause (MLC), clauses per sentence (C/S), verb phrases per T-unit (VP/T), clauses per T-unit (C/T), dependent clauses per clause (DC/C), dependent clauses per T-unit (DC/T), T-units per sentence (T/S), complex T-unit ratio (CT/T), coordinate phrases per T-unit (CP/T), coordinate phrases per clause (CP/C), complex nominals per T-unit (CN/T), and complex nominals per clause (CN/C).
24
+
25
+ The analyzer calls the Stanford parser (Klein & Manning, 2003) to parse the input file and Tregex (Levy and Andrew, 2006) to query the parse trees. Both the Stanford parser and Tregex are bundled in this download and installation along with the appropriate licenses.
26
+
27
+ CONTENTS
28
+
29
+ [1] Running the single file analyzer
30
+ [2] Input format
31
+ [3] Output format
32
+ [4] Running the multiple file analyzer
33
+ [5] A list of the files included in this package
34
+
35
+ ==============================================================================
36
+ [1] Running the single file analyzer
37
+
38
+ To run the single file analyzer, type the following at the command line:
39
+
40
+ python analyzeText.py <input_file> <output_file>
41
+
42
+ Note that the python script should be called from within this directory. To make sure everything runs correctly, run the following and compare your output with the sample1_output file in the samples/ subdirectory.
43
+
44
+ python analyzeText.py samples/sample1.txt samples/sample1_testing
45
+ ==============================================================================
46
+ [2] Input format
47
+
48
+ The input file should be a clean English text in plain text format (with a .txt suffix in the filename). Sample files can be found in the "samples" sub-directory.
49
+
50
+ ==============================================================================
51
+ [3] Output format
52
+
53
+ A name of the output file must be provided, but you can name it anything you like.
54
+
55
+ The first line in the output file is a comma-delimited list of 24 fields, including Filename, abbreviations of the 9 structures mentioned above, and abbreviations of the 14 syntactic complexity indices mentioned above.
56
+
57
+ The second line (and subsequent lines if analyzing multiple files in a directory) is a comma-delimited list of 24 values, including the name of the input file, the frequency counts of the 9 structures, and the values of the 14 syntactic complexity indices.
58
+
59
+ This format may be hard to read but allows for easy import to Excel or SPSS.
60
+
61
+ ==============================================================================
62
+ [4] Running the multiple file analyzer
63
+
64
+ To run the multiple file analyzer, type the following at the command line:
65
+
66
+ python analyzeFolder.py <path_to_input_file_folder> <output_file>
67
+
68
+ path_to_input_file_folder is the path to the folder or directory that contains the text files you want to analyze (e.g., /home/inputFiles/). The path should end with a slash, as in the example. Only files that end with the .txt suffix will be analyzed.
69
+
70
+ Note that the python script should be called from within this directory. To make sure everything runs correctly, run the following and compare your output with the samples_output file in the samples/ subdirectory.
71
+
72
+ python analyzeFolder.py samples/ samples/samples_testing
73
+ ==============================================================================
74
+ [5] A list of the files included in this package
75
+
76
+ README-L2SCA.txt - this file
77
+
78
+ analyzeText.py - the single file analyzer
79
+
80
+ analyzeFolder.py - the multiple file analyzer
81
+
82
+ samples/ - this directory includes the following sample files:
83
+
84
+ sample1.txt: an English text in plain text format
85
+
86
+ sample2.txt: another English text in plain text format
87
+
88
+ sample1_output: sample output file generated by the single file analyzer
89
+
90
+ samples_output: sample output file generated by the multiple file analyzer
91
+
92
+ All files for Tregex 3.3.1
93
+
94
+ Stanford parser 3.3.1
lng/L2SCA/README-gui.txt ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Tregex GUI v3.3.1 - 2014-01-04
2
+ ----------------------------------------------
3
+
4
+ Copyright (c) 2003-2012 The Board of Trustees of
5
+ The Leland Stanford Junior University. All Rights Reserved.
6
+
7
+ Original core Tregex code by Roger Levy and Galen Andrew.
8
+ Original core Tsurgeon code by Roger Levy.
9
+ TregexGUI by Anna Rafferty
10
+ Support code, additional features, etc. by Chris Manning
11
+ This release prepared by John Bauer.
12
+
13
+ ----------------------------
14
+ TREGEX GRAPHICAL USER INTERFACE (GUI) README
15
+ ----------------------------
16
+
17
+ The Tregex GUI is a graphical user interface for Tregex and Tsurgeon.
18
+ You can access it by double-clicking on the jar file tregex.jar. For
19
+ searching large treebanks, you may need to use more memory; the script
20
+ run-tregex-gui.command includes this allocation of memory and can be run
21
+ from the command line or double-click to run on a Mac. If you still have
22
+ memory problems, you can allot more memory by opening the script in a
23
+ text editor and changing "-mx300m" to include a bigger number (e.g.,
24
+ "-mx512m"). Tregex requires Java 1.5+. Further documentation for
25
+ Tregex and Tsurgeon can be found in README-tregex.txt and
26
+ README-tsurgeon.txt, respectively.
27
+
28
+ ----------------------------
29
+ LOADING TREEBANKS/TREE FILES
30
+ ----------------------------
31
+
32
+ To load a file containing Penn Treebank formatted trees, choose "Load trees..." from the file menu.
33
+ Multiple tree files and/or directories may be selected. After selecting the tree files you wish to
34
+ load, press "Load with file filters" to choose what filters you would like to apply to the files.
35
+ All filters are run based on the name of the file. Possible filtering options are:
36
+
37
+ - Prefix: Load only files that start with the given character sequence
38
+
39
+ - Extension: Load only files that end with the given character sequence
40
+
41
+ - Has number in range: Loads only numbered files such that the number falls in the given range, inclusive.
42
+ Ranges can be disjoint as long as multiple ranges are comma-separated (e.g., "100-500,550-700")
43
+
44
+ File filters are combined such that all loaded files must obey all of
45
+ the filters; only one filter of any given type should be specified.
46
+
47
+ Once the tree files are loaded, their names appear in the upper left hand panel "Tree files:".
48
+ Unchecking the check boxes next to the files causes the unchecked files not to be included in
49
+ searches/tsurgeon operations. To remove all files from the tree panel, choose "Clear all files"
50
+ from the Edit menu.
51
+
52
+ ----------------------------
53
+ PERFORMING TREGEX SEARCHES
54
+ ----------------------------
55
+
56
+ To perform a Tregex search, load the files you would like to search and type a Tregex pattern
57
+ in the "Pattern:" box in the top middle of the window. Press "Help" beneath the Pattern box
58
+ for information about Tregex syntax. After you have typed the pattern, press "Search" to
59
+ find all matches to the given pattern.
60
+
61
+ By default, trees that contain at least one match are displayed in the "Matches:" panel in the
62
+ top right of the window, and the first matching tree is graphically displayed in the bottom
63
+ portion of the window. Click on a match in the Match panel to display it graphically. In the
64
+ graphical display, matched nodes in the tree are displayed in a different color than other nodes.
65
+ To display only the matched subtrees, choose "Preferences..." (Mac, from the Application menu) or
66
+ "Options..." (other OS, under Tools), and check "Show only matched portions of the tree". You must
67
+ rerun the search to switch between showing only matched portions and showing full trees.
68
+
69
+ In preferences, other display options can also be set, such as the colors, size, and font used by
70
+ the graphical display.
71
+
72
+ ----------------------------
73
+ USING TSURGEON
74
+ ----------------------------
75
+
76
+ Tsurgeon modifications can also be performed using Interactive Tregex. To enable Tsurgeon, choose
77
+ "Preferences..." from the File menu and check "Enable Tsurgeon". You can now run Tsurgeon scripts.
78
+ Tsurgeon commands must be paired with a Tregex pattern that names the nodes on which modifications
79
+ will be performed. Type the Tregex pattern in the Pattern box, and type the modifications you would
80
+ like to make in the "Tsurgeon script:" box. Then click "Run script" to perform the modifications.
81
+ Each Tsurgeon operation must appear on a separate line in the Tsurgeon script box. Press "Help" for
82
+ some information about Tsurgeon operation syntax.
83
+
84
+
85
+ ----------------------------
86
+ SAVING RESULTS
87
+ ----------------------------
88
+
89
+ You can save the results of a Tregex search or Tsurgeon operation by choosing "Save matches..." from the
90
+ File menu. This saves all trees in the Matches panel in Penn Treebank form. "Save matched sentences..." saves
91
+ the matches in sentence String form, just as they show up in the matches panel.
92
+
93
+ You can also save a log of the number of matches found for each pattern you have searched. By clicking the
94
+ "Statistics" button in the middle of the screen, below the Tsurgeon buttons, you can see a table of the patterns
95
+ for which you have searched, the number of trees that each matched, and the number of overall matches that were
96
+ found. To save this information in a tab delimited text file, choose "Save statistics..." from the File menu.
97
+
98
+ All three save options save files in the encoding specified in the Preferences panel for loading tree files.
99
+
100
+ ----------------------------
101
+ MULTILANGUAGE SUPPORT
102
+ ----------------------------
103
+
104
+ Some multilanguage support is built into Tregex GUI, and most languages can be read by the GUI. To enable
105
+ this support, go to Preferences (Mac, under the application menu) or Options (other OS, under the Tools menu).
106
+ Several options may need to be changed: tree reader factory, head finder, font, and encoding. Several possible
107
+ tree reader factories and head finders are provided; you may also specify your own. Two common languages you may be
108
+ trying to use are Chinese or Arabic; any head finder or tree reader factory beginning with "Chinese" or "Arabic" will
109
+ work for these languages, and additionally, CTBTreeReaderFactory is compatible with many Chinese treebanks. Based on
110
+ your choice of head finder and tree reader factory, the Tregex GUI will guess if you may need a different font and/or
111
+ text encoding. If a different text encoding is usually used for your selections, you will be prompted as to what text
112
+ encoding you would like to use. This may also be specified directly in the Preferences panel.
113
+
114
+
115
+
116
+ ----------------------------
117
+ QUESTIONS
118
+ ----------------------------
119
+
120
+ For more information on Tregex or Tsurgeon, read README-tregex.txt and README-tsurgeon.txt, and also look at the javadocs
121
+ suggested in those files. For questions about this distribution, please contact Stanford's JavaNLP group at
122
+ [email protected]. We provide assistance on a best-effort basis.
123
+
124
+ ----------------------------
125
+ LICENSE
126
+ ----------------------------
127
+
128
+ Tregex GUI
129
+ Copyright (c) 2007-2011 The Board of Trustees of
130
+ The Leland Stanford Junior University. All Rights Reserved.
131
+
132
+ This program is free software; you can redistribute it and/or
133
+ modify it under the terms of the GNU General Public License
134
+ as published by the Free Software Foundation; either version 2
135
+ of the License, or (at your option) any later version.
136
+
137
+ This program is distributed in the hope that it will be useful,
138
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
139
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
140
+ GNU General Public License for more details.
141
+
142
+ You should have received a copy of the GNU General Public License
143
+ along with this program; if not, write to the Free Software
144
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
145
+
146
+ For more information, bug reports, fixes, contact:
147
+ Christopher Manning
148
+ Dept of Computer Science, Gates 1A
149
+ Stanford CA 94305-9010
150
+ USA
151
152
+ http://www-nlp.stanford.edu/software/tregex.shtml
153
+
154
+
155
+ ----------------------------
156
+ CHANGES
157
+ ----------------------------
158
+
159
+ 2014-01-04 3.3.1 Bugfix release, new createSubtree tsurgeon
160
+ operation
161
+
162
+ 2013-11-12 3.3.0 Allow a TregexMatcher to have its own
163
+ HeadFinder, useful for the dependencies
164
+
165
+ 2013-06-19 3.2.0 Fix for tsurgeon number reading bug
166
+
167
+ 2013-04-04 2.0.6 Update to maintain compatibility
168
+
169
+ 2012-11-11 2.0.5 Efficiency improvements
170
+
171
+ 2012-07-09 2.0.4 Minor bug fixes
172
+
173
+ 2012-05-22 2.0.3 Rebuilt to be compatible with everything.
174
+
175
+ 2012-03-09 2.0.2 Efficiency improvements
176
+
177
+ 2011-12-16 2.0.1 Fix bug in matchesAt, fix bug in category
178
+ function, add macros
179
+
180
+ 2011-09-14 2.0.0 Efficiency improvements, include semgrex.
181
+
182
+ 2011-05-15 1.4.4 Rebuilt to be compatible with everything.
183
+
184
+ 2011-05-15 1.4.3 Rebuilt to be compatible with everything.
185
+
186
+ 2011-04-17 1.4.2 Rebuilt to be compatible with tagger, parser,
187
+ and corenlp.
188
+
189
+ 2010-11-18 1.4.1 Small fixes and improvements (improved help
190
+ screens, multipattern Tsurgeon scripts with
191
+ comments introduced by % supported, unclosed
192
+ regex no longer crashes GUI, support character
193
+ encodings in script files, fix bug in tregex
194
+ matching immediate domination path, TregexGUI
195
+ now shows filename and line number of each
196
+ match in matches panel)
197
+
198
+ 2009-09-30 1.4 GUI slider for tree size, generalized relabel
199
+ command (incompatibly), __ and @ now supported
200
+ in path constraints; bugfixes.
201
+
202
+ 2008-05-06 1.1 Several bug fixes; addition of browse trees
203
+ function, improved copy/paste and drag and
204
+ drop support; misc. feature additions
205
+
206
+ 2007-09-20 1.0 Initial release
lng/L2SCA/README-tregex.txt ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Tregex v3.3.1 - 2014-01-04
2
+ ----------------------------------------------
3
+
4
+ Copyright (c) 2003-2012 The Board of Trustees of
5
+ The Leland Stanford Junior University. All Rights Reserved.
6
+
7
+ Original core Tregex code by Roger Levy and Galen Andrew.
8
+ Original core Tsurgeon code by Roger Levy.
9
+ GUI by Anna Rafferty
10
+ Support code, additional features, etc. by Chris Manning
11
+ This release prepared by John Bauer.
12
+
13
+ This package contains Tregex and Tsurgeon.
14
+
15
+ Tregex is a Tgrep2-style utility for matching patterns in trees. It can
16
+ be run in a graphical user interface, from the command line using the
17
+ TregexPattern main method, or used programmatically in java code via the
18
+ TregexPattern, TregexMatcher and TregexPatternCompiler classes.
19
+
20
+ As of version 1.2, the Tsurgeon tree-transformation utility is bundled
21
+ together with Tregex. See the file README.tsurgeon for details.
22
+
23
+ Java version 1.6 is required to use Tregex. If you really want to use
24
+ Tregex under an earlier version of Java, look into RetroWeaver:
25
+
26
+ http://retroweaver.sourceforge.net/
27
+
28
+
29
+ QUICKSTART
30
+ -----------------------------------------------
31
+
32
+ Programmatic use, command-line use, and GUI-use are supported. To access the
33
+ graphical interface for Tsurgeon and Tregex, double-click the tregex.jar file.
34
+ Some help (particularly with syntax) is provided within the program; for further
35
+ assistance, see README-gui.txt and the documentation mentioned below.
36
+
37
+ A full explanation of pattern syntax and usage is given in the javadocs
38
+ (particularly TregexPattern), and some of this information is also presented in
39
+ the TREGEX SYNTAX section below. As a quick example of usage,
40
+ the following line will scan an English PennTreebank annotated corpus
41
+ and print all nodes representing a verb phrase dominating a past-tense
42
+ verb and a noun phrase.
43
+
44
+ ./tregex.sh 'VP < VBD < NP' corpus_dir
45
+
46
+
47
+ CONTENTS
48
+ -----------------------------------------------
49
+
50
+ README-tregex.txt
51
+
52
+ This file.
53
+
54
+
55
+ README-tsurgeon.txt
56
+
57
+ Documentation for Tsurgeon, a tool for modifying trees.
58
+
59
+ README-gui.txt
60
+
61
+ Documentation for the graphical interface for Tregex and Tsurgeon tools.
62
+
63
+ LICENSE.txt
64
+
65
+ Tregex is licensed under the GNU General Public License.
66
+
67
+ stanford-tregex.jar
68
+
69
+ This is a JAR file containing all the Stanford classes necessary to
70
+ run tregex.
71
+
72
+ src
73
+
74
+ A directory containing the Java 1.6 source code for the Tregex
75
+ distribution.
76
+
77
+ javadoc
78
+
79
+ Javadocs for the distribution. In particular, look at the javadocs
80
+ for the class edu.stanford.nlp.trees.tregex.TregexPattern. The
81
+ first part of that class's javadoc describes syntax and semantics
82
+ for relations, node labels, node names, and variable groups. The
83
+ docs for the main method describe command-line options.
84
+
85
+ tregex.sh
86
+
87
+ a shell script for invoking the Tregex tree search tool.
88
+
89
+ tsurgeon.sh
90
+
91
+ a shell script for invoking the Tsurgeon tree transformation tool.
92
+
93
+ run-tregex-gui.command
94
+
95
+ A command file that can be double-clicked on a Mac to start the gui.
96
+
97
+ run-tregex-gui.bat
98
+
99
+ A bat file that can be double-clicked on a PC to start the gui.
100
+
101
+ examples
102
+
103
+ a directory containing several sample files to show Tsurgeon operation:
104
+ - atree
105
+ a sample natural-language tree in Penn Treebank annotation style.
106
+ - exciseNP
107
+ - renameVerb
108
+ - relabelWithGroupName
109
+ Sample tree-transformation operation files for Tsurgeon. See
110
+ README-tsurgeon.txt for more information about the contents of these
111
+ files.
112
+
113
+
114
+ TREGEX
115
+ -----------------------------------------------
116
+ Tregex Pattern Syntax and Uses
117
+
118
+ Using a Tregex pattern, you can find only those trees that match the pattern you're
119
+ looking for. The following table shows the symbols that are allowed in the pattern,
120
+ and below there is more information about using these patterns.
121
+
122
+ Table of Symbols and Meanings:
123
+ A << B
124
+ A dominates B
125
+ A >> B
126
+ A is dominated by B
127
+ A < B
128
+ A immediately dominates B
129
+ A > B
130
+ A is immediately dominated by B
131
+ A $ B
132
+ A is a sister of B (and not equal to B)
133
+ A .. B
134
+ A precedes B
135
+ A . B
136
+ A immediately precedes B
137
+ A ,, B
138
+ A follows B
139
+ A , B
140
+ A immediately follows B
141
+ A <<, B
142
+ B is a leftmost descendent of A
143
+ A <<- B
144
+ B is a rightmost descendent of A
145
+ A >>, B
146
+ A is a leftmost descendent of B
147
+ A >>- B
148
+ A is a rightmost descendent of B
149
+ A <, B
150
+ B is the first child of A
151
+ A >, B
152
+ A is the first child of B
153
+ A <- B
154
+ B is the last child of A
155
+ A >- B
156
+ A is the last child of B
157
+ A <` B
158
+ B is the last child of A
159
+ A >` B
160
+ A is the last child of B
161
+ A <i B
162
+ B is the ith child of A (i > 0)
163
+ A >i B
164
+ A is the ith child of B (i > 0)
165
+ A <-i B
166
+ B is the ith-to-last child of A (i > 0)
167
+ A >-i B
168
+ A is the ith-to-last child of B (i > 0)
169
+ A <: B
170
+ B is the only child of A
171
+ A >: B
172
+ A is the only child of B
173
+ A <<: B
174
+ A dominates B via an unbroken chain (length > 0) of unary local trees.
175
+ A >>: B
176
+ A is dominated by B via an unbroken chain (length > 0) of unary local trees.
177
+ A $++ B
178
+ A is a left sister of B (same as $.. for context-free trees)
179
+ A $-- B
180
+ A is a right sister of B (same as $,, for context-free trees)
181
+ A $+ B
182
+ A is the immediate left sister of B (same as $. for context-free trees)
183
+ A $- B
184
+ A is the immediate right sister of B (same as $, for context-free trees)
185
+ A $.. B
186
+ A is a sister of B and precedes B
187
+ A $,, B
188
+ A is a sister of B and follows B
189
+ A $. B
190
+ A is a sister of B and immediately precedes B
191
+ A $, B
192
+ A is a sister of B and immediately follows B
193
+ A <+(C) B
194
+ A dominates B via an unbroken chain of (zero or more) nodes matching description C
195
+ A >+(C) B
196
+ A is dominated by B via an unbroken chain of (zero or more) nodes matching description C
197
+ A .+(C) B
198
+ A precedes B via an unbroken chain of (zero or more) nodes matching description C
199
+ A ,+(C) B
200
+ A follows B via an unbroken chain of (zero or more) nodes matching description C
201
+ A <<# B
202
+ B is a head of phrase A
203
+ A >># B
204
+ A is a head of phrase B
205
+ A <# B
206
+ B is the immediate head of phrase A
207
+ A ># B
208
+ A is the immediate head of phrase B
209
+ A == B
210
+ A and B are the same node
211
+ A : B
212
+ [this is a pattern-segmenting operator that places no constraints on the relationship between A and B]
213
+
214
+ Label descriptions can be literal strings, which must match labels exactly, or regular
215
+ expressions in regular expression bars: /regex/. Literal string matching proceeds as
216
+ String equality. In order to prevent ambiguity with other Tregex symbols, only standard
217
+ "identifiers" are allowed as literals, i.e., strings matching [a-zA-Z]([a-zA-Z0-9_])* .
218
+ If you want to use other symbols, you can do so by using a regular expression instead of
219
+ a literal string. A disjunctive list of literal strings can be given separated by '|'.
220
+ The special string '__' (two underscores) can be used to match any node. (WARNING!!
221
+ Use of the '__' node description may seriously slow down search.) If a label description
222
+ is preceded by '@', the label will match any node whose basicCategory matches the description.
223
+ NB: A single '@' thus scopes over a disjunction specified by '|': @NP|VP means things with basic category NP or VP.
224
+
225
+ Label description regular expressions are matched as find(), as in Perl/tgrep;
226
+ you need to specify ^ or $ to constrain matches.
227
+
228
+ In a chain of relations, all relations are relative to the first node in the chain.
229
+ For example, (S < VP < NP) means an S over a VP and also over an NP. If instead what
230
+ you want is an S above a VP above an NP, you should write S < (VP < NP).
231
+
232
+ Nodes can be grouped using parentheses '(' and ')' as in S < (NP $++ VP) to match an S
233
+ over an NP, where the NP has a VP as a right sister.
234
+
235
+ Boolean relational operators
236
+
237
+ Relations can be combined using the '&' and '|' operators, negated with the '!' operator,
238
+ and made optional with the '?' operator. Thus (NP < NN | < NNS) will match an NP node
239
+ dominating either an NN or an NNS. (NP > S & $++ VP) matches an NP that is both under
240
+ an S and has a VP as a right sister.
241
+
242
+ Relations can be grouped using brackets '[' and ']'. So the expression
243
+
244
+ NP [< NN | < NNS] & > S
245
+
246
+ matches an NP that (1) dominates either an NN or an NNS, and (2) is under an S. Without
247
+ brackets, & takes precedence over |, and equivalent operators are left-associative. Also
248
+ note that & is the default combining operator if the operator is omitted in a chain of
249
+ relations, so that the two patterns are equivalent:
250
+ (S < VP < NP)
251
+ (S < VP & < NP)
252
+
253
+ As another example, (VP < VV | < NP % NP) can be written explicitly as (VP [< VV | [< NP & % NP] ] ).
254
+
255
+ Relations can be negated with the '!' operator, in which case the expression will match
256
+ only if there is no node satisfying the relation. For example (NP !< NNP) matches only
257
+ NPs not dominating an NNP. Label descriptions can also be negated with '!': (NP < !NNP|NNS)
258
+ matches NPs dominating some node that is not an NNP or an NNS.
259
+
260
+ Relations can be made optional with the '?' operator. This way the expression will match even
261
+ if the optional relation is not satisfied. This is useful when used together with node naming
262
+ (see below).
263
+
264
+
265
+ Basic Categories
266
+
267
+ In order to consider only the "basic category" of a tree label, i.e. to ignore functional tags
268
+ or other annotations on the label, prefix that node's description with the @ symbol. For example
269
+ (@NP < @/NN.?/). This can only be used for individual nodes; if you want all nodes to use the
270
+ basic category, it would be more efficient to use a TreeNormalizer to remove functional tags
271
+ before passing the tree to the TregexPattern.
272
+
273
+
274
+ Segmenting patterns
275
+
276
+ The ":" operator allows you to segment a pattern into two pieces. This can simplify your pattern
277
+ writing. For example, the pattern S : NP matches only those S nodes in trees that also have an NP node.
278
+
279
+
280
+ Naming nodes
281
+
282
+ Nodes can be given names (a.k.a. handles) using '='. A named node will be stored in a map that
283
+ maps names to nodes so that if a match is found, the node corresponding to the named node can
284
+ be extracted from the map. For example (NP < NNP=name) will match an NP dominating an NNP
285
+ and after a match is found, the map can be queried with the name to retreived the matched node
286
+ using {@link TregexMatcher#getNode(Object o)} with (String) argument "name" (not "=name"). Note
287
+ that you are not allowed to name a node that is under the scope of a negation operator (the
288
+ semantics would be unclear, since you can't store a node that never gets matched to). Trying to
289
+ do so will cause a ParseException to be thrown. Named nodes can be put within the scope of an
290
+ optional operator.
291
+
292
+ Named nodes that refer back to previous named nodes need not have a node description -- this is
293
+ known as "backreferencing". In this case, the expression will match only when all instances of
294
+ the same name get matched to the same tree node. For example, the pattern:
295
+
296
+ (@NP <, (@NP $+ (/,/ $+ (@NP $+ /,/=comma))) <- =comma)
297
+
298
+ matches only an NP dominating exactly the sequence NP, NP; the mother NP cannot have any other
299
+ daughters. Multiple backreferences are allowed. If the node with no node description does not
300
+ refer to a previously named node, there will be no error, the expression simply will not match
301
+ anything.
302
+
303
+ Another way to refer to previously named nodes is with the "link" symbol: '~'. A link is like a
304
+ backreference, except that instead of having to be <i>equal to</i> the referred node, the
305
+ current node only has to match the label of the referred to node. A link cannot have a node
306
+ description, i.e. the '~' symbol must immediately follow a relation symbol.
307
+
308
+
309
+ Variable Groups
310
+
311
+ If you write a node description using a regular expression, you can assign its matching groups to
312
+ variable names. If more than one node has a group assigned to the same variable name, then matching
313
+ will only occur when all such groups capture the same string. This is useful for enforcing
314
+ coindexation constraints. The syntax is:
315
+
316
+ / <regex-stuff> /#<group-number>%<variable-name>
317
+
318
+ For example, the pattern (designed for Penn Treebank trees):
319
+
320
+ @SBAR < /^WH.*-([0-9]+)$/#1%index<<(__=empty < (/^-NONE-/< /^\\*T\\*-([0-9]+)$/#1%index))
321
+
322
+ will match only such that the WH- node under the SBAR is coindexed with the trace node that gets the name empty.
323
+
324
+
325
+ MISCELLANEOUS
326
+ -----------------------------------------------
327
+
328
+ Head Finders
329
+
330
+ To use the headship relations <# ># <<# >># correctly it is
331
+ important to specify a HeadFinder class appropriate to the trees
332
+ that you are searching. For information about how to specify a
333
+ HeadFinder class at the command line or through the API, please read
334
+ the javadocs for the class
335
+ edu.stanford.nlp.trees.tregex.TregexPattern. The following
336
+ HeadFinder classes are included with the Tregex distribution:
337
+
338
+ Penn Treebank of English (http://www.cis.upenn.edu/~treebank/):
339
+
340
+ edu.stanford.nlp.trees.CollinsHeadFinder (default)
341
+
342
+ Penn Treebank of Chinese (http://www.cis.upenn.edu/~chinese/):
343
+
344
+ edu.stanford.nlp.trees.international.pennchinese.ChineseHeadFinder
345
+
346
+ Penn Treebank of Arabic (http://www.ircs.upenn.edu/arabic/):
347
+
348
+ edu.stanford.nlp.trees.international.arabic.ArabicHeadFinder
349
+
350
+ NEGRA (http://www.coli.uni-saarland.de/projects/sfb378/negra-corpus/)
351
+
352
+ and
353
+
354
+ TIGER (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERCorpus/)
355
+
356
+ treebanks of German (these can use the same headfinder):
357
+
358
+ edu.stanford.nlp.trees.international.negra.NegraHeadFinder
359
+
360
+ Tuebingen Treebank of Written German (http://www.sfs.uni-tuebingen.de/en_tuebadz.shtml):
361
+
362
+ edu.stanford.nlp.trees.international.tuebadz.TueBaDZHeadFinder
363
+
364
+
365
+ Tdiff
366
+
367
+ TregexGUI supports a constituent diff'ing method--similar to the UNIX diff command--for trees. To
368
+ enable Tdiff:
369
+ 1) Clear the tree file list: File -> Clear tree file list
370
+ 2) Enable Tdiff: Options -> Tdiff
371
+ 3) Load two (2) files using the "File -> Load" dialog.
372
+ 4) Select "Browse" on the main display
373
+
374
+ The GUI will display differences between each pair of trees in the two files. As such, the two files must
375
+ contain the same number of trees.
376
+
377
+ The first file in the tree file list is treated as the reference. Trees from the second file
378
+ will be displayed in the GUI, with bracketing differences highlighted in blue. Below the tree,
379
+ constituents in the reference tree that do not appear in the tree from the second file are shown
380
+ as lines below each respective span.
381
+
382
+ Tregex searches are supported and apply to the trees in the second file.
383
+
384
+ This feature was designed for debugging and analyzing parser output.
385
+
386
+ THANKS
387
+ -----------------------------------------------
388
+
389
+ Thanks to the members of the Stanford Natural Language Processing Lab
390
+ for great collaborative work on Java libraries for natural language
391
+ processing.
392
+
393
+ http://nlp.stanford.edu/javanlp/
394
+
395
+ LICENSE
396
+ -----------------------------------------------
397
+
398
+ Tregex, Tsurgeon, and Interactive Tregex
399
+ Copyright (c) 2003-2012 The Board of Trustees of
400
+ The Leland Stanford Junior University. All Rights Reserved.
401
+
402
+ This program is free software; you can redistribute it and/or
403
+ modify it under the terms of the GNU General Public License
404
+ as published by the Free Software Foundation; either version 2
405
+ of the License, or (at your option) any later version.
406
+
407
+ This program is distributed in the hope that it will be useful,
408
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
409
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
410
+ GNU General Public License for more details.
411
+
412
+ You should have received a copy of the GNU General Public License
413
+ along with this program; if not, write to the Free Software
414
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
415
+
416
+ For more information, bug reports, fixes, contact:
417
+ Christopher Manning
418
+ Dept of Computer Science, Gates 1A
419
+ Stanford CA 94305-9010
420
+ USA
421
422
+ http://www-nlp.stanford.edu/software/tregex.shtml
423
+
424
+
425
+ CONTACT
426
+ -----------------------------------------------
427
+
428
+ For questions about this distribution, please contact Stanford's JavaNLP group at
429
+ [email protected]. We provide assistance on a best-effort basis.
lng/L2SCA/README-tsurgeon.txt ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Tsurgeon v3.3.1 - 2014-01-04
2
+ ----------------------------------------------
3
+
4
+ Copyright (c) 2003-2012 The Board of Trustees of
5
+ The Leland Stanford Junior University. All Rights Reserved.
6
+
7
+ Original core Tregex code by Roger Levy and Galen Andrew.
8
+ Original core Tsurgeon code by Roger Levy.
9
+ GUI by Anna Rafferty
10
+ Support code, additional features, etc. by Chris Manning
11
+ This release prepared by John Bauer.
12
+
13
+ This package contains Tregex and Tsurgeon.
14
+
15
+ Tregex is a Tgrep2-style utility for matching patterns in trees. It can
16
+ be run in a graphical user interface, from the command line using the
17
+ TregexPattern main method, or used programmatically in java code via the
18
+ TregexPattern, TregexMatcher and TregexPatternCompiler classes.
19
+
20
+ As of version 1.2, the Tsurgeon tree-transformation utility is bundled
21
+ together with Tregex. See the file README.tsurgeon for details.
22
+
23
+ Java version 1.6 is required to use Tregex. If you really want to use
24
+ Tregex under an earlier version of Java, look into RetroWeaver:
25
+
26
+ http://retroweaver.sourceforge.net/
27
+
28
+ TSURGEON
29
+ ----------------------------------------------
30
+ Tsurgeon is a tool for modifying trees that match a particular Tregex
31
+ pattern. Further documentation for Tregex and Tregex GUI can be found in
32
+ README-tregex.txt and README-gui.txt, respectively.
33
+
34
+ ----------------------------------------------
35
+
36
+ Brief description:
37
+
38
+ Takes some trees, tries to match one or more tregex expressions to
39
+ each tree, and for each successful match applies some surgical
40
+ operations to the tree. Pretty-prints each resulting tree (after all
41
+ successful match/operation sets have applied) to standard output.
42
+
43
+
44
+ A simple example:
45
+
46
+ ./tsurgeon.csh -treeFile atree exciseNP renameVerb
47
+
48
+ -----------------------------------------
49
+ RUNNING TREGEX
50
+ -----------------------------------------
51
+
52
+ Program Command Line Options:
53
+
54
+ -treeFile <filename>
55
+
56
+ specify the name of the file that has the trees you want to transform.
57
+
58
+ -po <matchPattern> <operation>
59
+
60
+ Apply a single operation to every tree using the specified match
61
+ pattern and the specified operation.
62
+
63
+ -s
64
+
65
+ Prints the output trees one per line, instead of pretty-printed.
66
+
67
+ The arguments are then Tsurgeon scripts.
68
+ Each argument should be the name of a transformation file that contains a list of pattern
69
+ and transformation operation list pairs. That is, it is a sequence of pairs of a
70
+ TregexPattern pattern on one or more lines, then a
71
+ blank line (empty or whitespace), then a list of transformation operations one per line
72
+ (as specified by Tsurgeon syntax below to apply when the pattern is matched,
73
+ and then another blank line (empty or whitespace).
74
+ Note the need for blank lines: The code crashes if they are not present as separators
75
+ (although the blank line at the end of the file can be omitted).
76
+ The script file can include comment lines, either whole comment lines or
77
+ trailing comments introduced by %, which extend to the end of line. A needed percent
78
+ mark in patterns or operations can be escaped by a preceding backslash.
79
+
80
+ -----------------------------------------
81
+ TSURGEON SYNTAX
82
+ -----------------------------------------
83
+
84
+ Legal operation syntax and semantics (see Examples section for further detail):
85
+
86
+ delete <name_1> <name_2> ... <name_m>
87
+
88
+ For each name_i, deletes the node it names and everything below it.
89
+
90
+ prune <name_1> <name_2> ... <name_m>
91
+
92
+ For each name_i, prunes out the node it names. Pruning differs from
93
+ deletion in that if pruning a node causes its parent to have no
94
+ children, then the parent is in turn pruned too.
95
+
96
+ excise <name1> <name2>
97
+
98
+ The name1 node should either dominate or be the same as the name2
99
+ node. This excises out everything from name1 to name2. All the
100
+ children of name2 go into the parent of name1, where name1 was.
101
+
102
+ relabel <name> <new-label>
103
+
104
+ Relabels the node to have the new label. There are three possible forms
105
+ for the new-label:
106
+ relabel nodeX VP - for changing a node label to an alphanumeric
107
+ string, relabel nodeX /''/ - for relabeling a node to something that
108
+ isn't a valid identifier without quoting, and relabel nodeX
109
+ /^VB(.*)$/verb\/$1/ - for regular expression based relabeling. In the
110
+ last case, all matches of the regular expression against the node
111
+ label are replaced with the replacement String. This has the semantics
112
+ of Java/Perl's replaceAll: you may use capturing groups and put them
113
+ in replacements with $n. Also, as in the example, you can escape a
114
+ slash in the middle of the second and third forms with \/ and \\.
115
+ This last version lets you make a new label that is an arbitrary
116
+ String function of the original label and additional characters that
117
+ you supply.
118
+
119
+ insert <name> <position>
120
+ insert <tree> <position>
121
+
122
+ inserts the named node, or a manually specified tree (see below for
123
+ syntax), into the position specified. Right now the only ways to
124
+ specify position are:
125
+
126
+ $+ <name> the left sister of the named node
127
+ $- <name> the right sister of the named node
128
+ >i <name> the i_th daughter of the named node.
129
+ >-i <name> the i_th daughter, counting from the right, of the named node.
130
+
131
+ move <name> <position>
132
+
133
+ moves the named node into the specified position. To be precise, it
134
+ deletes (*NOT* prunes) the node from the tree, and re-inserts it
135
+ into the specified position. See above for how to specify position
136
+
137
+ replace <name1> <name2>
138
+
139
+ deletes name1 and inserts a copy of name2 in its place.
140
+
141
+ adjoin <tree> <target-node>
142
+
143
+ adjoins the specified auxiliary tree (see below for syntax) into the
144
+ target node specified. The daughters of the target node will become
145
+ the daughters of the foot of the auxiliary tree.
146
+
147
+ adjoinH <tree> <target-node>
148
+
149
+ similar to adjoin, but preserves the target node and makes it the root
150
+ of <tree>. (It is still accessible as <code>name</code>. The root of
151
+ the auxiliary tree is ignored.)
152
+
153
+ adjoinF <tree> <target-node>
154
+
155
+ similar to adjoin, but preserves the target node and makes it the foot
156
+ of <tree>. (It is still accessible as <code>name</code>, and retains
157
+ its status as parent of its children. The foot of the auxiliary tree
158
+ is ignored.)
159
+
160
+ coindex <name_1> <name_2> ... <name_m>
161
+
162
+ Puts a (Penn Treebank style) coindexation suffix of the form "-N" on
163
+ each of nodes name_1 through name_m. The value of N will be
164
+ automatically generated in reference to the existing coindexations
165
+ in the tree, so that there is never an accidental clash of
166
+ indices across things that are not meant to be coindexed.
167
+
168
+ -----------------------------------------
169
+
170
+ Syntax for trees to be inserted or adjoined:
171
+
172
+
173
+ A tree to be adjoined in can be specified with LISP-like
174
+ parenthetical-bracketing tree syntax such as those used for the Penn
175
+ Treebank. For example, for the NP "the dog" to be inserted you might
176
+ use the syntax
177
+
178
+ (NP (Det the) (N dog))
179
+
180
+ That's all that there is for a tree to be inserted. Auxiliary trees
181
+ (a la Tree Adjoining Grammar) must also have exactly one frontier node
182
+ ending in the character "@", which marks it as the "foot" node for
183
+ adjunction. Final instances of the character "@" in terminal node labels
184
+ will be removed from the actual label of the tree.
185
+
186
+ For example, if you wanted to adjoin the adverb "breathlessly" into a
187
+ VP, you might specify the following auxiliary tree:
188
+
189
+ (VP (Adv breathlessly) VP@ )
190
+
191
+ All other instances of "@" in terminal nodes must be escaped (i.e.,
192
+ appear as \@); this escaping will be removed by tsurgeon.
193
+
194
+ In addition, any node of a tree can be named (the same way as in
195
+ tregex), by appending =<name> to the node label. That name can be
196
+ referred to by subsequent tsurgeon operations triggered by the same
197
+ match. All other instances of "=" in node labels must be escaped
198
+ (i.e., appear as \=); this escaping will be removed by tsurgeon. For
199
+ example, if you want to insert an NP trace somewhere and coindex it
200
+ with a node named "antecedent" you might say
201
+
202
+ insert (NP (-NONE- *T*=trace)) <node-location>
203
+ coindex trace antecedent $
204
+
205
+ -----------------------------------------
206
+ Examples of Tsurgeon operations:
207
+
208
+ Tree (used in all examples):
209
+ (ROOT
210
+ (S
211
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia))
212
+ (VP (VBD was)
213
+ (VP (VBN arrested)
214
+ (PP (IN in)
215
+ (NP (NNP May)))))
216
+ (. .)))
217
+
218
+ Apply delete:
219
+ VP < PP=prep
220
+ delete prep
221
+ Result:
222
+ (ROOT
223
+ (S
224
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia))
225
+ (VP (VBD was)
226
+ (VP (VBN arrested)
227
+ (. .)))
228
+ The PP node directly dominated by a VP is removed, as is
229
+ everything under it.
230
+
231
+ Apply prune:
232
+ S < (NP < NNP=noun)
233
+ prune noun
234
+ Result:
235
+ (ROOT
236
+ (S
237
+ (VP (VBD was)
238
+ (VP (VBN arrested)
239
+ (PP (IN in)
240
+ (NP (NNP May)))))
241
+ (. .)))
242
+ The NNP node is removed, and since this results in the NP above it
243
+ having no terminal children, the NP node is deleted as well.
244
+ Note: This is different from delete in which the NP above the NNP
245
+ would remain.
246
+
247
+ Apply excise:
248
+ VP < PP=prep
249
+ excise prep prep
250
+ Result:
251
+ (ROOT
252
+ (S
253
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia))
254
+ (VP (VBD was)
255
+ (VP (VBN arrested)
256
+ (IN in)
257
+ (NP (NNP May)))))
258
+ (. .)))
259
+ The PP node is removed, and all of its children are added in the
260
+ place it was previously located. Excise removes all the nodes from
261
+ the first named node to the second named node, and the children of
262
+ the second node are added as children of the parent of the first node.
263
+ Thus, for another example:
264
+ VP=verb < PP=prep
265
+ excise verb prep
266
+ Result:
267
+ (ROOT
268
+ (S
269
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia))
270
+ (VP (VBD was)
271
+ (IN in)
272
+ (NP (NNP May)))
273
+ (. .)))
274
+
275
+
276
+ Apply relabel:
277
+ VP=v < PP=prep
278
+ relabel prep verbPrep
279
+ Result:
280
+ (ROOT
281
+ (S
282
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia))
283
+ (VP (VBD was)
284
+ (VP (VBN arrested)
285
+ (verbPrep (IN in)
286
+ (NP (NNP May)))))
287
+ (. .)))
288
+ The label for the node called prep (PP) is changed to verbPrep.
289
+ The other form of relabel uses regular expressions; consider the following
290
+ operation:
291
+ /^VB.+/=v
292
+ relabel v /^VB(.*)$/ #1
293
+ Result:
294
+ (ROOT
295
+ (S
296
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia))
297
+ (VP (D was)
298
+ (VP (N arrested)
299
+ (PP (IN in)
300
+ (NP (NNP May)))))
301
+ (. .)))
302
+ The Tregex pattern matches all nodes that begin "VB" and have at least one
303
+ more character. The Tsurgeon operation then matches the node label to the
304
+ regular expression "^VB(.*)$" and selects the text matching the first part
305
+ that is not completely specified in the pattern. In this case, that is the
306
+ part matching the wildcard (.*), which matches all characters after the VB.
307
+ The node is then relabeled with that part of the text, causing, for example,
308
+ "VBD" to be relabeled "D". The "#1" specifies that the name of the node
309
+ should be the first group in the regex.
310
+
311
+ Apply insert (shown here with inserting a node, but could also be a tree):
312
+ S < (NP < (NNP=name !$- DET))
313
+ insert (DET Ms.) $+ name
314
+ Result:
315
+ (ROOT
316
+ (S
317
+ (NP (DET Ms.)
318
+ (NNP Maria_Eugenia_Ochoa_Garcia))
319
+ (VP (VBD was)
320
+ (VP (VBN arrested)
321
+ (PP (IN in)
322
+ (NP (NNP May)))))
323
+ (. .)))
324
+ The pattern matches the NNP node that is directly dominated by an NP
325
+ (which is directly dominated by an S) and is not a direct right sister
326
+ of a DET. Thus, the (DET Ms.) node is inserted immediately to the left
327
+ of that NNP node, as specified by "$+ name". "$+" is the location and
328
+ "name" describes what node the location is with respect to.
329
+ Note: Tsurgeon will re-search for matches after each run of the script;
330
+ thus, cycles may occur, causing the program to not terminate. The key
331
+ is to write patterns that match prior to the changes you would like to
332
+ make but that do not match afterwards. If the clause "!$- DET" had been
333
+ left out in this example, Tsurgeon would have matched the pattern after
334
+ every insert operation, causing an infinite number of DETs to be added.
335
+
336
+ Apply move:
337
+ VP=verb < PP=prep
338
+ move prep $- verb
339
+ Result:
340
+ (ROOT
341
+ (S
342
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia))
343
+ (VP (VBD was)
344
+ (VP (VBN arrested)))
345
+ (PP (IN in)
346
+ (NP (NNP May)))
347
+ (. .)))
348
+ The PP is moved out of the VP that dominates it and added as a direct right
349
+ sister of the VP. As for insert, "$-" specifies the location for prep while
350
+ "verb" specifies what that location is relative to.
351
+ Note: "move" is a macro operation that deletes the given node and then inserts
352
+ it. "move" does not use prune, and thus any branches that now lack terminals will
353
+ remain rather than being removed.
354
+
355
+ Apply replace:
356
+ S < (NP=name < NNP)
357
+ replace name (NP (DET A) (NN woman))
358
+ Result:
359
+ (ROOT
360
+ (S
361
+ (NP (DET A)
362
+ (NN woman))
363
+ (VP (VBD was)
364
+ (VP (VBN arrested)
365
+ (PP (IN in)
366
+ (NP (NNP May)))))
367
+ (. .)))
368
+ "name" is matched to an NP that is dominated by an S and dominates an NNP, and
369
+ a new subtree ("(NP (DET A) (NN woman))") is added in the place where "name" was.
370
+ Note: This operation is vulnerable to falling into an infinite loop. See the note
371
+ concerning the "insert" operation and how patterns are matched.
372
+
373
+ Apply adjoin:
374
+ S < (NP=name < NNP)
375
+ adjoin (NP (DET A) (NN woman) NP@) name
376
+ Result:
377
+ (ROOT
378
+ (S
379
+ (NP (DET A)
380
+ (NN woman)
381
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia)))
382
+ (VP (VBD was)
383
+ (VP (VBN arrested)
384
+ (PP (IN in)
385
+ (NP (NNP May)))))
386
+ (. .)))
387
+ First, the NP is matched to the NP dominating the NNP tag. Then, the specified
388
+ tree ("(NP (DET A) (NN woman) NP@)") is placed in that location. The "@" symbol
389
+ specifies that the children of the original NP node ("name") are to be placed
390
+ as children of a new NP node that is directly to the right of (NN woman). If
391
+ the specified tree were "(NP (DET A) (NN woman) VP@)" then the child
392
+ (NNP Maria_Eugenia_Ochoa_Garcia) would appear under a VP. Exactly one "@" node
393
+ must appear in the specified tree in order to indicate where to place the node
394
+ from the original tree.
395
+
396
+ Apply adjoinH:
397
+ S < (NP=name < NNP)
398
+ adjoinH ((NP (DET A) (NN woman) NP@)) name
399
+ Result:
400
+ (ROOT
401
+ (S
402
+ (NP (NP (DET A)
403
+ (NN woman)
404
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia))))
405
+ (VP (VBD was)
406
+ (VP (VBN arrested)
407
+ (PP (IN in)
408
+ (NP (NNP May)))))
409
+ (. .)))
410
+ This operation differs from adjoin in that it retains the named node (in this
411
+ case, "name"). The named node is made the root of the specified tree, resulting
412
+ in two NP nodes dominating the DET in this example whereas only one was present
413
+ in the previous example. Note that the specified tree is wrapped in an extra
414
+ pair of parentheses in order to show the syntax for retaining the named node.
415
+ If the extra parentheses were not there and the specified tree was, for example,
416
+ (VP (DET A) (NN woman) NP@), the VP would be ignored in order to retain an NP as
417
+ the root. Thus, in this case, "adjoinH (VP (DET A) (NN woman) NP@) name" and
418
+ "adjoinH ((DET A) (NN woman) NP@) name" both produce the same tree:
419
+ (ROOT
420
+ (S
421
+ (NP (DET A)
422
+ (NN woman)
423
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia)))
424
+ (VP (VBD was)
425
+ (VP (VBN arrested)
426
+ (PP (IN in)
427
+ (NP (NNP May)))))
428
+ (. .)))
429
+
430
+
431
+ Apply adjoinF:
432
+ S < (NP=name < NNP)
433
+ adjoinF (NP(DET A) (NN woman) @) name
434
+ Result:
435
+ (ROOT
436
+ (S
437
+ (NP (DET A)
438
+ (NN woman)
439
+ (NP (NNP Maria_Eugenia_Ochoa_Garcia)))
440
+ (VP (VBD was)
441
+ (VP (VBN arrested)
442
+ (PP (IN in)
443
+ (NP (NNP May)))))
444
+ (. .)))
445
+ This operation is very similar to adjoin and adjoinH, but this time the original
446
+ named node ("name" in this case) is maintained as the root of the subtree that
447
+ is adjoined. Thus, no node label needs to be given in front of the "@" and if
448
+ one is given, it will be ignored. For instance, "adjoinF (NP(DET A) (NN woman) VP@) name"
449
+ would still produce the same tree as above, despite the VP preceding the @.
450
+
451
+ Apply coindex:
452
+ NP=node < NNP=name
453
+ coindex node name
454
+ Result:
455
+ (ROOT
456
+ (S
457
+ (NP-1 (NNP-1 Maria_Eugenia_Ochoa_Garcia))
458
+ (VP (VBD was)
459
+ (VP (VBN arrested)
460
+ (PP (IN in)
461
+ (NP-2 (NNP-2 May)))))
462
+ (. .)))
463
+ This causes the named nodes to be numbered such that all nodes that are part
464
+ of the same match have the same number and all matches have distinct new names.
465
+ We had two instances of an NP dominating an NNP in this example, and they were
466
+ renamed such that NP-i < NNP-i for each match, with 1 <= i <= number of matches.
467
+
468
+ -----------------------------------------
469
+ TSURGEON SCRIPTS
470
+ -----------------------------------------
471
+ Script format:
472
+
473
+ Tsurgeon scripts are a combination of a Tregex pattern to match and a series
474
+ of Tsurgeon operations to perform on that match. The first line of a Tsurgeon
475
+ script should be the Tregex pattern. This should be followed by a blank line,
476
+ and then each subsequent line may contain one Tsurgeon operation. Tsurgeon
477
+ operations should not be separated by blank lines. The following is an example
478
+ of correctly formatted script:
479
+
480
+ S < NP=node < NNP=name
481
+
482
+ relabel node NP_NAME
483
+ coindex node name
484
+
485
+
486
+ Comments:
487
+
488
+ The character % introduces a comment that extends to the end of the
489
+ line. All other intended uses of % must be escaped as \% .
490
+
491
+ -----------------------------------------
492
+ CONTACT
493
+ -----------------------------------------
494
+
495
+ For questions about this distribution, please contact Stanford's JavaNLP group at
496
+ [email protected]. We provide assistance on a best-effort basis.
497
+
498
+
499
+ -----------------------------------------
500
+ LICENSE
501
+ -----------------------------------------
502
+
503
+ Tregex, Tsurgeon, and Interactive Tregex
504
+ Copyright (c) 2003-2011 The Board of Trustees of
505
+ The Leland Stanford Junior University. All Rights Reserved.
506
+
507
+ This program is free software; you can redistribute it and/or
508
+ modify it under the terms of the GNU General Public License
509
+ as published by the Free Software Foundation; either version 2
510
+ of the License, or (at your option) any later version.
511
+
512
+ This program is distributed in the hope that it will be useful,
513
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
514
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
515
+ GNU General Public License for more details.
516
+
517
+ You should have received a copy of the GNU General Public License
518
+ along with this program; if not, write to the Free Software
519
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
520
+
521
+ For more information, bug reports, fixes, contact:
522
+ Christopher Manning
523
+ Dept of Computer Science, Gates 1A
524
+ Stanford CA 94305-9010
525
+ USA
526
527
+ http://www-nlp.stanford.edu/software/tregex.shtml
528
+
529
+
lng/L2SCA/Semgrex.ppt ADDED
Binary file (285 kB). View file
 
lng/L2SCA/analyzeFolder.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script analyzes all text files (with the .txt suffix only) in a single folder or directory.
3
+
4
+ It counts the occurrences of the following 9 structures in each text: words (W), sentences (S), verb phrases (VP), clauses (C), T-units (T), dependent clauses (DC), complex T-units (CT), coordinate phrases (CP), and complex nominals (CN).
5
+
6
+ These frequency counts are then used to compute the following 14 syntactic complexity indices of each text: mean length of sentence (MLS), mean length of T-unit (MLT), mean length of clause (MLC), clauses per sentence (C/S), verb phrases per T-unit (VP/T), clauses per T-unit (C/T), dependent clauses per clause (DC/C), dependent clauses per T-unit (DC/T), T-units per sentence (T/S), complex T-unit ratio (CT/T), coordinate phrases per T-unit (CP/T), coordinate phrases per clause (CP/C), complex nominals per T-unit (CN/T), and complex nominals per clause (CN/C).
7
+
8
+ To run the script, type the following at the command line:
9
+ python analyzeFolder.py inputFileDirectory outputFileName
10
+
11
+ inputFileDirectory is the path to the directory or folder that contains the text files you want to analyze (e.g., /home/inputFiles/). The path should end with a slash, as in the example. outputFileName is the name you want to assign to the output file. Both must be provided.
12
+
13
+ The first line of the output file will be a comma-delimited list of 24 fields (including Filename, abbreviations of the 9 structures, and abbreviations of the 14 syntactic complexity indices). The subsequent lines of the file will each provide a comma-delimited list of 24 values for one input file (including the name of the file, frequency counts of the 9 structures, and the values of the 14 syntactic complexity indices). This format may be hard to read but allows easy import to Excel or SPSS.
14
+ """
15
+
16
+ import sys, os, subprocess, glob, re
17
+
18
+ #a function to divide two numbers from strings
19
+ def division(x,y):
20
+ if float(x)==0 or float(y)==0:
21
+ return 0
22
+ return float(x)/float(y)
23
+
24
+ #the following is a list of tregex patterns for various structures
25
+
26
+ #sentence (S)
27
+ s="'ROOT'"
28
+
29
+ #verb phrase (VP)
30
+ vp="'VP > S|SINV|SQ'"
31
+ vp_q="'MD|VBZ|VBP|VBD > (SQ !< VP)'"
32
+
33
+ #clause (C)
34
+ c="'S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]'"
35
+
36
+ #T-unit (T)
37
+ t="'S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]'"
38
+
39
+ #dependent clause (DC)
40
+ dc="'SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])'"
41
+
42
+ #complex T-unit (CT)
43
+ ct="'S|SBARQ|SINV|SQ [> ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]] << (SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP\
44
+ |VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]))'"
45
+
46
+ #coordinate phrase (CP)
47
+ cp="'ADJP|ADVP|NP|VP < CC'"
48
+
49
+ #complex nominal (CN)
50
+ cn1="'NP !> NP [<< JJ|POS|PP|S|VBG | << (NP $++ NP !$+ CC)]'"
51
+ cn2="'SBAR [<# WHNP | <# (IN < That|that|For|for) | <, S] & [$+ VP | > VP]'"
52
+ cn3="'S < (VP <# VBG|TO) $+ VP'"
53
+
54
+ #fragment clause
55
+ fc="'FRAG > ROOT !<< (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])'"
56
+
57
+ #fragment T-unit
58
+ ft="'FRAG > ROOT !<< (S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP])'"
59
+
60
+ #list of patterns to search for
61
+ patternlist=[s,vp,c,t,dc,ct,cp,cn1,cn2,cn3,fc,ft,vp_q]
62
+
63
+ #location of the Stanford parser
64
+ parserPath="stanford-parser-full-2014-01-04/lexparser.sh"
65
+
66
+ #path to the directory or folder containing input files
67
+ directoryPath=sys.argv[1]
68
+
69
+ #output file name
70
+ outputFile=open(sys.argv[2],"w")
71
+
72
+ #write a list of 24 comma-delimited fields to the output file
73
+ fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
74
+ outputFile.write(fields+"\n")
75
+
76
+ #process text files in the directory one by one
77
+ for filename in glob.glob( os.path.join(directoryPath, '*.txt') ):
78
+ print("Processing "+filename+"...")
79
+
80
+ #Extract the name of the file being processed
81
+ output=filename.split('/')[-1]
82
+
83
+ #name a temporary file to hold the parse trees of the input file
84
+ parsedFile=filename+".parsed"
85
+
86
+ #parse the input file
87
+ command=parserPath + " " + filename + " > " + parsedFile
88
+ a=subprocess.getoutput(command).split('\n')[-1].split()
89
+
90
+ #list of counts of the patterns
91
+ patterncount=[]
92
+
93
+ #query the parse trees using the tregex patterns
94
+ for pattern in patternlist:
95
+ command = "./tregex.sh " + pattern + " " + parsedFile + " -C -o"
96
+ count = subprocess.getoutput(command).split('\n')[-1]
97
+ patterncount.append(int(count))
98
+
99
+ #update frequencies of complex nominals, clauses, and T-units
100
+ patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
101
+ patterncount[2]=patterncount[2]+patterncount[-3]
102
+ patterncount[3]=patterncount[3]+patterncount[-2]
103
+ patterncount[1]=patterncount[1]+patterncount[-1]
104
+
105
+ #word count
106
+ infile=open(parsedFile,"r")
107
+ content=infile.read()
108
+ w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
109
+ infile.close()
110
+
111
+ #add frequencies of words and other structures to output string
112
+ output+=","+str(w) #number of words
113
+ for count in patterncount[:8]:
114
+ output+=","+str(count)
115
+
116
+ #list of frequencies of structures other than words
117
+ [s,vp,c,t,dc,ct,cp,cn]=patterncount[:8]
118
+
119
+ #compute the 14 syntactic complexity indices
120
+ mls=division(w,s)
121
+ mlt=division(w,t)
122
+ mlc=division(w,c)
123
+ c_s=division(c,s)
124
+ vp_t=division(vp,t)
125
+ c_t=division(c,t)
126
+ dc_c=division(dc,c)
127
+ dc_t=division(dc,t)
128
+ t_s=division(t,s)
129
+ ct_t=division(ct,t)
130
+ cp_t=division(cp,t)
131
+ cp_c=division(cp,c)
132
+ cn_t=division(cn,t)
133
+ cn_c=division(cn,c)
134
+
135
+ #add syntactic complexity indices to output string
136
+ for ratio in [mls,mlt,mlc,c_s,vp_t,c_t,dc_c,dc_t,t_s,ct_t,cp_t,cp_c,cn_t,cn_c]:
137
+ output+=","+str("%.4F" % ratio)
138
+
139
+ #write output string to output file
140
+ outputFile.write(output+"\n")
141
+
142
+ #delete the temporary file holding the parse trees
143
+ command="rm "+parsedFile
144
+ os.popen(command)
145
+
146
+ outputFile.close()
147
+
148
+ print("Done. Output was saved to " + sys.argv[2] +".")
lng/L2SCA/analyzeText.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script analyzes a single plain text file.
3
+
4
+ It counts the occurrences of the following 9 structures in the text: words (W), sentences (S), verb phrases (VP), clauses (C), T-units (T), dependent clauses (DC), complex T-units (CT), coordinate phrases (CP), and complex nominals (CN).
5
+
6
+ These frequency counts are then used to compute the following 14 syntactic complexity indices of the text: mean length of sentence (MLS), mean length of T-unit (MLT), mean length of clause (MLC), clauses per sentence (C/S), verb phrases per T-unit (VP/T), clauses per T-unit (C/T), dependent clauses per clause (DC/C), dependent clauses per T-unit (DC/T), T-units per sentence (T/S), complex T-unit ratio (CT/T), coordinate phrases per T-unit (CP/T), coordinate phrases per clause (CP/C), complex nominals per T-unit (CN/T), and complex nominals per clause (CN/C).
7
+
8
+ To run the script, type the following at the command line:
9
+ python analyzeText.py inputFileName outputFileName
10
+
11
+ inputFileName is the name of your input text file. outputFileName is the name you want to assign to the output file. Both names must be provided.
12
+
13
+ The output file will contain 2 lines. The first line is a comma-delimited list of 24 fields (including Filename, abbreviations of the 9 structures, and abbreviations of the 14 syntactic complexity indices). The second line is a comma-delimited list of 24 values (including the name of the input file, frequency counts of the 9 structures, and the values of the 14 syntactic complexity indices). This format may be hard to read but allows easy import to Excel or SPSS.
14
+ """
15
+
16
+ import sys, os, subprocess, re, tempfile
17
+
18
+ #a function to divide two numbers from strings
19
+ def division(x,y):
20
+ if float(x)==0 or float(y)==0:
21
+ return 0
22
+ return float(x)/float(y)
23
+
24
+ #the following is a list of tregex patterns for various structures
25
+
26
+ #sentence (S)
27
+ s="ROOT"
28
+
29
+ #verb phrase (VP)
30
+ vp="VP > S|SINV|SQ"
31
+ vp_q="MD|VBZ|VBP|VBD > (SQ !< VP)"
32
+
33
+ #clause (C)
34
+ c="S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]"
35
+
36
+ #T-unit (T)
37
+ t="S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]"
38
+
39
+ #dependent clause (DC)
40
+ dc="SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])"
41
+
42
+ #complex T-unit (CT)
43
+ ct="S|SBARQ|SINV|SQ [> ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]] << (SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]))"
44
+
45
+ #coordinate phrase (CP)
46
+ cp="ADJP|ADVP|NP|VP < CC"
47
+
48
+ #complex nominal (CN)
49
+ cn1="NP !> NP [<< JJ|POS|PP|S|VBG | << (NP $++ NP !$+ CC)]"
50
+ cn2="SBAR [<# WHNP | <# (IN < That|that|For|for) | <, S] & [$+ VP | > VP]"
51
+ cn3="S < (VP <# VBG|TO) $+ VP"
52
+
53
+ #fragment clause
54
+ fc="FRAG > ROOT !<< (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])"
55
+
56
+ #fragment T-unit
57
+ ft="FRAG > ROOT !<< (S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP])"
58
+
59
+ #list of patterns to search for
60
+ patternlist=[s,vp,c,t,dc,ct,cp,cn1,cn2,cn3,fc,ft,vp_q]
61
+
62
+ pre_path = 'lng/L2SCA'
63
+
64
+ #location of the Stanford parser
65
+ parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04/lexparser.sh")
66
+
67
+ def sca(input_text):
68
+ inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
69
+ with open(inputFile, 'w') as f:
70
+ f.write(input_text + '\n')
71
+
72
+ #extract the name of the file being processed
73
+ output = []
74
+
75
+ #name a temporary file to hold the parse trees of the input file
76
+ parsedFile=inputFile+".parsed"
77
+
78
+ #parse the input file
79
+ command=[parserPath, inputFile]
80
+ with open(parsedFile, 'w') as f:
81
+ subprocess.run(command, stdout = f,
82
+ stderr = subprocess.DEVNULL
83
+ )
84
+
85
+
86
+ #list of counts of the patterns
87
+ patterncount=[]
88
+
89
+ #query the parse trees using the tregex patterns
90
+ for pattern in patternlist:
91
+ command = [os.path.join(pre_path, "tregex.sh"), pattern, parsedFile, "-C", "-o"]
92
+ out = subprocess.run(command, check = True, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL)
93
+ if len(out.stdout) > 0:
94
+ count = int(out.stdout)
95
+ else:
96
+ count = 0
97
+ patterncount.append(count)
98
+
99
+ #update frequencies of complex nominals, clauses, and T-units
100
+ patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
101
+ patterncount[2]=patterncount[2]+patterncount[-3]
102
+ patterncount[3]=patterncount[3]+patterncount[-2]
103
+ patterncount[1]=patterncount[1]+patterncount[-1]
104
+
105
+ #word count
106
+ infile=open(parsedFile,"r")
107
+ content=infile.read()
108
+ w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
109
+ infile.close()
110
+
111
+ #add frequencies of words and other structures to output string
112
+ output.append(int(w))
113
+ for count in patterncount[:8]:
114
+ output.append(int(count))
115
+
116
+ #list of frequencies of structures other than words
117
+ [s,vp,c,t,dc,ct,cp,cn]=patterncount[:8]
118
+
119
+ #compute the 14 syntactic complexity indices
120
+ mls=division(w,s)
121
+ mlt=division(w,t)
122
+ mlc=division(w,c)
123
+ c_s=division(c,s)
124
+ vp_t=division(vp,t)
125
+ c_t=division(c,t)
126
+ dc_c=division(dc,c)
127
+ dc_t=division(dc,t)
128
+ t_s=division(t,s)
129
+ ct_t=division(ct,t)
130
+ cp_t=division(cp,t)
131
+ cp_c=division(cp,c)
132
+ cn_t=division(cn,t)
133
+ cn_c=division(cn,c)
134
+
135
+ #add syntactic complexity indices to output string
136
+ for ratio in [mls,mlt,mlc,c_s,vp_t,c_t,dc_c,dc_t,t_s,ct_t,cp_t,cp_c,cn_t,cn_c]:
137
+ output.append(ratio)
138
+
139
+ #list of 24 comma-delimited fields
140
+ # fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
141
+
142
+ #delete the temporary file holding the parse trees
143
+ os.remove(inputFile)
144
+ os.remove(parsedFile)
145
+
146
+ return output
lng/L2SCA/examples/atree ADDED
@@ -0,0 +1 @@
 
 
1
+ (VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))
lng/L2SCA/examples/exciseNP ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ NP < (NP=np < NNS) < (NP=np1 < NN)
2
+
3
+ excise np np
4
+ excise np1 np1
5
+
6
+
lng/L2SCA/examples/relabelWithGroupName ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ /^VB.+/=v
2
+
3
+ relabel v /^VB(.*)$/ #1
4
+
lng/L2SCA/examples/renameVerb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ VBZ=vbz $ NP
2
+
3
+ relabel vbz MYVERB
lng/L2SCA/lib/ABOUT-AppleJavaExtensions.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AppleJavaExtensions.jar provides some stub methods to allow
2
+ compilation of code which contains Java methods that reference Mac OS
3
+ X specific Java APIs on any platform. This is needed only for
4
+ compilation of the class edu.stanford.nlp.trees.tregex.gui.OSXAdapter .
5
+ Using this class and the links to Apple-specific technologies is
6
+ required to allow the Mac version of Tregex to behave like a normal
7
+ Mac application in responding to About and Preferences... menu items.
8
+
9
+ This library is not needed or used at runtime on any platform.
10
+
11
+ If you'd prefer not to have these complications in the source for your
12
+ use on other platforms, simply delete both AppleJavaExtensions.jar and
13
+ the file src/edu/stanford/nlp/trees/tregex/gui/OSXAdapter.java . The
14
+ OSXAdapter class is loaded using reflection by the main TregexGUI
15
+ class, so its absence will not cause any errors in compilation.
16
+
17
+ The file README-AppleJavaExtensions.txt contains Apple's README and
18
+ license information for AppleJavaExtensions.jar . More information on
19
+ AppleJavaExtensions can be found at:
20
+
21
+ http://developer.apple.com/samplecode/AppleJavaExtensions/
22
+
23
+ This issue of needing to include AppleJavaExtensions.jar occurs for
24
+ many Java GUI programs which want to function well on Mac OS X,
25
+ including NetBeans, FindBugs, etc. Do a Google search on:
26
+
27
+ AppleJavaExtensions license
28
+
29
+ to find examples.
lng/L2SCA/lib/AppleJavaExtensions.jar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0f46aaca0deba5d07490f66d420a58e5a17e4fe8b5118a3ae831207d953f52b
3
+ size 4189
lng/L2SCA/lib/README-AppleJavaExtensions.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AppleJavaExtensions
2
+ v 1.2
3
+
4
+ This is a pluggable jar of stub classes representing the new Apple eAWT and eIO APIs for Java 1.4 on Mac OS X. The purpose of these stubs is to allow for compilation of eAWT- or eIO-referencing code on platforms other than Mac OS X. The jar file is enclosed in a zip archive for easy expansion on other platforms.
5
+
6
+ These stubs are not intended for the runtime classpath on non-Mac platforms. Please see the OSXAdapter sample for how to write cross-platform code that uses eAWT.
7
+
8
+ Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple
9
+ Computer, Inc. ("Apple") in consideration of your agreement to the
10
+ following terms, and your use, installation, modification or
11
+ redistribution of this Apple software constitutes acceptance of these
12
+ terms. If you do not agree with these terms, please do not use,
13
+ install, modify or redistribute this Apple software.
14
+
15
+ In consideration of your agreement to abide by the following terms, and
16
+ subject to these terms, Apple grants you a personal, non-exclusive
17
+ license, under Apple's copyrights in this original Apple software (the
18
+ "Apple Software"), to use, reproduce, modify and redistribute the Apple
19
+ Software, with or without modifications, in source and/or binary forms;
20
+ provided that if you redistribute the Apple Software in its entirety and
21
+ without modifications, you must retain this notice and the following
22
+ text and disclaimers in all such redistributions of the Apple Software.
23
+ Neither the name, trademarks, service marks or logos of Apple Computer,
24
+ Inc. may be used to endorse or promote products derived from the Apple
25
+ Software without specific prior written permission from Apple. Except
26
+ as expressly stated in this notice, no other rights or licenses, express
27
+ or implied, are granted by Apple herein, including but not limited to
28
+ any patent rights that may be infringed by your derivative works or by
29
+ other works in which the Apple Software may be incorporated.
30
+
31
+ The Apple Software is provided by Apple on an "AS IS" basis. APPLE
32
+ MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
33
+ THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
34
+ FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
35
+ OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
36
+
37
+ IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
38
+ OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
39
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
40
+ INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
41
+ MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
42
+ AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
43
+ STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
44
+ POSSIBILITY OF SUCH DAMAGE.
45
+
46
+ Copyright © 2003-2006 Apple Computer, Inc., All Rights Reserved
lng/L2SCA/run-tregex-gui.bat ADDED
@@ -0,0 +1 @@
 
 
1
+ java -mx300m -cp "stanford-tregex.jar;" edu.stanford.nlp.trees.tregex.gui.TregexGUI
lng/L2SCA/run-tregex-gui.command ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ #!/bin/sh
2
+ java -mx300m -cp `dirname $0`/stanford-tregex.jar edu.stanford.nlp.trees.tregex.gui.TregexGUI
lng/L2SCA/samples/my_sample.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ A few people in a restaurant setting, one of them is drinking orange juice.
lng/L2SCA/samples/sample1.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Scores of properties are under extreme fire threat as a huge blaze
2
+ continues to advance through Sydney's north-western suburbs. Fires
3
+ have also shut down the major road and rail links between Sydney and
4
+ Gosford.
5
+
6
+ The promotional stop in Sydney was everything to be expected for a
7
+ Hollywood blockbuster - phalanxes of photographers, a stretch limo to
8
+ a hotel across the Quay - but with one difference. A line-up of
9
+ masseurs was waiting to take the media in hand. Never has the term
10
+ "massaging the media" seemed so accurate.
lng/L2SCA/samples/sample1_output ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C
2
+ sample1.txt,87,5,11,7,5,2,2,2,12,17.4000,17.4000,12.4286,1.4000,2.2000,1.4000,0.2857,0.4000,1.0000,0.4000,0.4000,0.2857,2.4000,1.7143
lng/L2SCA/samples/sample2.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Naaman was the captain of the host of King of Syria. He was a great and honourable man but he had a disease called Leprosy. The little maid that served Naaman's wife told her if her master, Naaman, was with the prophet in Samaria, he would be healed. So the King of Syria sent Naaman, along with a letter to the King of Israel. When the King of Israel received the letter, he tore his clothes. The King was upset because he knew that he could not heal Naaman.
lng/L2SCA/samples/samples_output ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C
2
+ sample1.txt,87,5,11,7,5,2,2,2,12,17.4000,17.4000,12.4286,1.4000,2.2000,1.4000,0.2857,0.4000,1.0000,0.4000,0.4000,0.2857,2.4000,1.7143
3
+ sample2.txt,90,6,13,13,7,5,3,1,13,15.0000,12.8571,6.9231,2.1667,1.8571,1.8571,0.3846,0.7143,1.1667,0.4286,0.1429,0.0769,1.8571,1.0000
lng/L2SCA/stanford-parser-full-2014-01-04/LICENSE.txt ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 2, June 1991
3
+
4
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6
+ Everyone is permitted to copy and distribute verbatim copies
7
+ of this license document, but changing it is not allowed.
8
+
9
+ Preamble
10
+
11
+ The licenses for most software are designed to take away your
12
+ freedom to share and change it. By contrast, the GNU General Public
13
+ License is intended to guarantee your freedom to share and change free
14
+ software--to make sure the software is free for all its users. This
15
+ General Public License applies to most of the Free Software
16
+ Foundation's software and to any other program whose authors commit to
17
+ using it. (Some other Free Software Foundation software is covered by
18
+ the GNU Lesser General Public License instead.) You can apply it to
19
+ your programs, too.
20
+
21
+ When we speak of free software, we are referring to freedom, not
22
+ price. Our General Public Licenses are designed to make sure that you
23
+ have the freedom to distribute copies of free software (and charge for
24
+ this service if you wish), that you receive source code or can get it
25
+ if you want it, that you can change the software or use pieces of it
26
+ in new free programs; and that you know you can do these things.
27
+
28
+ To protect your rights, we need to make restrictions that forbid
29
+ anyone to deny you these rights or to ask you to surrender the rights.
30
+ These restrictions translate to certain responsibilities for you if you
31
+ distribute copies of the software, or if you modify it.
32
+
33
+ For example, if you distribute copies of such a program, whether
34
+ gratis or for a fee, you must give the recipients all the rights that
35
+ you have. You must make sure that they, too, receive or can get the
36
+ source code. And you must show them these terms so they know their
37
+ rights.
38
+
39
+ We protect your rights with two steps: (1) copyright the software, and
40
+ (2) offer you this license which gives you legal permission to copy,
41
+ distribute and/or modify the software.
42
+
43
+ Also, for each author's protection and ours, we want to make certain
44
+ that everyone understands that there is no warranty for this free
45
+ software. If the software is modified by someone else and passed on, we
46
+ want its recipients to know that what they have is not the original, so
47
+ that any problems introduced by others will not reflect on the original
48
+ authors' reputations.
49
+
50
+ Finally, any free program is threatened constantly by software
51
+ patents. We wish to avoid the danger that redistributors of a free
52
+ program will individually obtain patent licenses, in effect making the
53
+ program proprietary. To prevent this, we have made it clear that any
54
+ patent must be licensed for everyone's free use or not licensed at all.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ GNU GENERAL PUBLIC LICENSE
60
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61
+
62
+ 0. This License applies to any program or other work which contains
63
+ a notice placed by the copyright holder saying it may be distributed
64
+ under the terms of this General Public License. The "Program", below,
65
+ refers to any such program or work, and a "work based on the Program"
66
+ means either the Program or any derivative work under copyright law:
67
+ that is to say, a work containing the Program or a portion of it,
68
+ either verbatim or with modifications and/or translated into another
69
+ language. (Hereinafter, translation is included without limitation in
70
+ the term "modification".) Each licensee is addressed as "you".
71
+
72
+ Activities other than copying, distribution and modification are not
73
+ covered by this License; they are outside its scope. The act of
74
+ running the Program is not restricted, and the output from the Program
75
+ is covered only if its contents constitute a work based on the
76
+ Program (independent of having been made by running the Program).
77
+ Whether that is true depends on what the Program does.
78
+
79
+ 1. You may copy and distribute verbatim copies of the Program's
80
+ source code as you receive it, in any medium, provided that you
81
+ conspicuously and appropriately publish on each copy an appropriate
82
+ copyright notice and disclaimer of warranty; keep intact all the
83
+ notices that refer to this License and to the absence of any warranty;
84
+ and give any other recipients of the Program a copy of this License
85
+ along with the Program.
86
+
87
+ You may charge a fee for the physical act of transferring a copy, and
88
+ you may at your option offer warranty protection in exchange for a fee.
89
+
90
+ 2. You may modify your copy or copies of the Program or any portion
91
+ of it, thus forming a work based on the Program, and copy and
92
+ distribute such modifications or work under the terms of Section 1
93
+ above, provided that you also meet all of these conditions:
94
+
95
+ a) You must cause the modified files to carry prominent notices
96
+ stating that you changed the files and the date of any change.
97
+
98
+ b) You must cause any work that you distribute or publish, that in
99
+ whole or in part contains or is derived from the Program or any
100
+ part thereof, to be licensed as a whole at no charge to all third
101
+ parties under the terms of this License.
102
+
103
+ c) If the modified program normally reads commands interactively
104
+ when run, you must cause it, when started running for such
105
+ interactive use in the most ordinary way, to print or display an
106
+ announcement including an appropriate copyright notice and a
107
+ notice that there is no warranty (or else, saying that you provide
108
+ a warranty) and that users may redistribute the program under
109
+ these conditions, and telling the user how to view a copy of this
110
+ License. (Exception: if the Program itself is interactive but
111
+ does not normally print such an announcement, your work based on
112
+ the Program is not required to print an announcement.)
113
+
114
+ These requirements apply to the modified work as a whole. If
115
+ identifiable sections of that work are not derived from the Program,
116
+ and can be reasonably considered independent and separate works in
117
+ themselves, then this License, and its terms, do not apply to those
118
+ sections when you distribute them as separate works. But when you
119
+ distribute the same sections as part of a whole which is a work based
120
+ on the Program, the distribution of the whole must be on the terms of
121
+ this License, whose permissions for other licensees extend to the
122
+ entire whole, and thus to each and every part regardless of who wrote it.
123
+
124
+ Thus, it is not the intent of this section to claim rights or contest
125
+ your rights to work written entirely by you; rather, the intent is to
126
+ exercise the right to control the distribution of derivative or
127
+ collective works based on the Program.
128
+
129
+ In addition, mere aggregation of another work not based on the Program
130
+ with the Program (or with a work based on the Program) on a volume of
131
+ a storage or distribution medium does not bring the other work under
132
+ the scope of this License.
133
+
134
+ 3. You may copy and distribute the Program (or a work based on it,
135
+ under Section 2) in object code or executable form under the terms of
136
+ Sections 1 and 2 above provided that you also do one of the following:
137
+
138
+ a) Accompany it with the complete corresponding machine-readable
139
+ source code, which must be distributed under the terms of Sections
140
+ 1 and 2 above on a medium customarily used for software interchange; or,
141
+
142
+ b) Accompany it with a written offer, valid for at least three
143
+ years, to give any third party, for a charge no more than your
144
+ cost of physically performing source distribution, a complete
145
+ machine-readable copy of the corresponding source code, to be
146
+ distributed under the terms of Sections 1 and 2 above on a medium
147
+ customarily used for software interchange; or,
148
+
149
+ c) Accompany it with the information you received as to the offer
150
+ to distribute corresponding source code. (This alternative is
151
+ allowed only for noncommercial distribution and only if you
152
+ received the program in object code or executable form with such
153
+ an offer, in accord with Subsection b above.)
154
+
155
+ The source code for a work means the preferred form of the work for
156
+ making modifications to it. For an executable work, complete source
157
+ code means all the source code for all modules it contains, plus any
158
+ associated interface definition files, plus the scripts used to
159
+ control compilation and installation of the executable. However, as a
160
+ special exception, the source code distributed need not include
161
+ anything that is normally distributed (in either source or binary
162
+ form) with the major components (compiler, kernel, and so on) of the
163
+ operating system on which the executable runs, unless that component
164
+ itself accompanies the executable.
165
+
166
+ If distribution of executable or object code is made by offering
167
+ access to copy from a designated place, then offering equivalent
168
+ access to copy the source code from the same place counts as
169
+ distribution of the source code, even though third parties are not
170
+ compelled to copy the source along with the object code.
171
+
172
+ 4. You may not copy, modify, sublicense, or distribute the Program
173
+ except as expressly provided under this License. Any attempt
174
+ otherwise to copy, modify, sublicense or distribute the Program is
175
+ void, and will automatically terminate your rights under this License.
176
+ However, parties who have received copies, or rights, from you under
177
+ this License will not have their licenses terminated so long as such
178
+ parties remain in full compliance.
179
+
180
+ 5. You are not required to accept this License, since you have not
181
+ signed it. However, nothing else grants you permission to modify or
182
+ distribute the Program or its derivative works. These actions are
183
+ prohibited by law if you do not accept this License. Therefore, by
184
+ modifying or distributing the Program (or any work based on the
185
+ Program), you indicate your acceptance of this License to do so, and
186
+ all its terms and conditions for copying, distributing or modifying
187
+ the Program or works based on it.
188
+
189
+ 6. Each time you redistribute the Program (or any work based on the
190
+ Program), the recipient automatically receives a license from the
191
+ original licensor to copy, distribute or modify the Program subject to
192
+ these terms and conditions. You may not impose any further
193
+ restrictions on the recipients' exercise of the rights granted herein.
194
+ You are not responsible for enforcing compliance by third parties to
195
+ this License.
196
+
197
+ 7. If, as a consequence of a court judgment or allegation of patent
198
+ infringement or for any other reason (not limited to patent issues),
199
+ conditions are imposed on you (whether by court order, agreement or
200
+ otherwise) that contradict the conditions of this License, they do not
201
+ excuse you from the conditions of this License. If you cannot
202
+ distribute so as to satisfy simultaneously your obligations under this
203
+ License and any other pertinent obligations, then as a consequence you
204
+ may not distribute the Program at all. For example, if a patent
205
+ license would not permit royalty-free redistribution of the Program by
206
+ all those who receive copies directly or indirectly through you, then
207
+ the only way you could satisfy both it and this License would be to
208
+ refrain entirely from distribution of the Program.
209
+
210
+ If any portion of this section is held invalid or unenforceable under
211
+ any particular circumstance, the balance of the section is intended to
212
+ apply and the section as a whole is intended to apply in other
213
+ circumstances.
214
+
215
+ It is not the purpose of this section to induce you to infringe any
216
+ patents or other property right claims or to contest validity of any
217
+ such claims; this section has the sole purpose of protecting the
218
+ integrity of the free software distribution system, which is
219
+ implemented by public license practices. Many people have made
220
+ generous contributions to the wide range of software distributed
221
+ through that system in reliance on consistent application of that
222
+ system; it is up to the author/donor to decide if he or she is willing
223
+ to distribute software through any other system and a licensee cannot
224
+ impose that choice.
225
+
226
+ This section is intended to make thoroughly clear what is believed to
227
+ be a consequence of the rest of this License.
228
+
229
+ 8. If the distribution and/or use of the Program is restricted in
230
+ certain countries either by patents or by copyrighted interfaces, the
231
+ original copyright holder who places the Program under this License
232
+ may add an explicit geographical distribution limitation excluding
233
+ those countries, so that distribution is permitted only in or among
234
+ countries not thus excluded. In such case, this License incorporates
235
+ the limitation as if written in the body of this License.
236
+
237
+ 9. The Free Software Foundation may publish revised and/or new versions
238
+ of the General Public License from time to time. Such new versions will
239
+ be similar in spirit to the present version, but may differ in detail to
240
+ address new problems or concerns.
241
+
242
+ Each version is given a distinguishing version number. If the Program
243
+ specifies a version number of this License which applies to it and "any
244
+ later version", you have the option of following the terms and conditions
245
+ either of that version or of any later version published by the Free
246
+ Software Foundation. If the Program does not specify a version number of
247
+ this License, you may choose any version ever published by the Free Software
248
+ Foundation.
249
+
250
+ 10. If you wish to incorporate parts of the Program into other free
251
+ programs whose distribution conditions are different, write to the author
252
+ to ask for permission. For software which is copyrighted by the Free
253
+ Software Foundation, write to the Free Software Foundation; we sometimes
254
+ make exceptions for this. Our decision will be guided by the two goals
255
+ of preserving the free status of all derivatives of our free software and
256
+ of promoting the sharing and reuse of software generally.
257
+
258
+ NO WARRANTY
259
+
260
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261
+ FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262
+ OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263
+ PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264
+ OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266
+ TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267
+ PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268
+ REPAIR OR CORRECTION.
269
+
270
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272
+ REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273
+ INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274
+ OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275
+ TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276
+ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277
+ PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278
+ POSSIBILITY OF SUCH DAMAGES.
279
+
280
+ END OF TERMS AND CONDITIONS
281
+
282
+ How to Apply These Terms to Your New Programs
283
+
284
+ If you develop a new program, and you want it to be of the greatest
285
+ possible use to the public, the best way to achieve this is to make it
286
+ free software which everyone can redistribute and change under these terms.
287
+
288
+ To do so, attach the following notices to the program. It is safest
289
+ to attach them to the start of each source file to most effectively
290
+ convey the exclusion of warranty; and each file should have at least
291
+ the "copyright" line and a pointer to where the full notice is found.
292
+
293
+ <one line to give the program's name and a brief idea of what it does.>
294
+ Copyright (C) <year> <name of author>
295
+
296
+ This program is free software; you can redistribute it and/or modify
297
+ it under the terms of the GNU General Public License as published by
298
+ the Free Software Foundation; either version 2 of the License, or
299
+ (at your option) any later version.
300
+
301
+ This program is distributed in the hope that it will be useful,
302
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
303
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304
+ GNU General Public License for more details.
305
+
306
+ You should have received a copy of the GNU General Public License along
307
+ with this program; if not, write to the Free Software Foundation, Inc.,
308
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309
+
310
+ Also add information on how to contact you by electronic and paper mail.
311
+
312
+ If the program is interactive, make it output a short notice like this
313
+ when it starts in an interactive mode:
314
+
315
+ Gnomovision version 69, Copyright (C) year name of author
316
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317
+ This is free software, and you are welcome to redistribute it
318
+ under certain conditions; type `show c' for details.
319
+
320
+ The hypothetical commands `show w' and `show c' should show the appropriate
321
+ parts of the General Public License. Of course, the commands you use may
322
+ be called something other than `show w' and `show c'; they could even be
323
+ mouse-clicks or menu items--whatever suits your program.
324
+
325
+ You should also get your employer (if you work as a programmer) or your
326
+ school, if any, to sign a "copyright disclaimer" for the program, if
327
+ necessary. Here is a sample; alter the names:
328
+
329
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
331
+
332
+ <signature of Ty Coon>, 1 April 1989
333
+ Ty Coon, President of Vice
334
+
335
+ This General Public License does not permit incorporating your program into
336
+ proprietary programs. If your program is a subroutine library, you may
337
+ consider it more useful to permit linking proprietary applications with the
338
+ library. If this is what you want to do, use the GNU Lesser General
339
+ Public License instead of this License.
lng/L2SCA/stanford-parser-full-2014-01-04/Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is a rudimentary Makefile for rebuilding the parser.
2
+ # We actually use ant (q.v.) or a Java IDE.
3
+
4
+ JAVAC = javac
5
+ JAVAFLAGS = -O -d classes -encoding utf-8
6
+
7
+ parser:
8
+ mkdir -p classes
9
+ $(JAVAC) $(JAVAFLAGS) src/edu/stanford/nlp/*/*.java \
10
+ src/edu/stanford/nlp/*/*/*.java src/edu/stanford/nlp/*/*/*/*.java
11
+ cd classes ; jar -cfm ../stanford-parser-`date +%Y-%m-%d`.jar ../src/edu/stanford/nlp/parser/lexparser/lexparser-manifest.txt edu ; cd ..
12
+ cp stanford-parser-`date +%Y-%m-%d`.jar stanford-parser.jar
13
+ rm -rf classes
lng/L2SCA/stanford-parser-full-2014-01-04/ParserDemo.java ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import java.util.Collection;
3
+ import java.util.List;
4
+ import java.io.StringReader;
5
+
6
+ import edu.stanford.nlp.process.Tokenizer;
7
+ import edu.stanford.nlp.process.TokenizerFactory;
8
+ import edu.stanford.nlp.process.CoreLabelTokenFactory;
9
+ import edu.stanford.nlp.process.DocumentPreprocessor;
10
+ import edu.stanford.nlp.process.PTBTokenizer;
11
+ import edu.stanford.nlp.ling.CoreLabel;
12
+ import edu.stanford.nlp.ling.HasWord;
13
+ import edu.stanford.nlp.ling.Sentence;
14
+ import edu.stanford.nlp.trees.*;
15
+ import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
16
+
17
+ class ParserDemo {
18
+
19
+ /**
20
+ * The main method demonstrates the easiest way to load a parser.
21
+ * Simply call loadModel and specify the path of a serialized grammar
22
+ * model, which can be a file, a resource on the classpath, or even a URL.
23
+ * For example, this demonstrates loading from the models jar file, which
24
+ * you therefore need to include in the classpath for ParserDemo to work.
25
+ */
26
+ public static void main(String[] args) {
27
+ LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
28
+ if (args.length > 0) {
29
+ demoDP(lp, args[0]);
30
+ } else {
31
+ demoAPI(lp);
32
+ }
33
+ }
34
+
35
+ /**
36
+ * demoDP demonstrates turning a file into tokens and then parse
37
+ * trees. Note that the trees are printed by calling pennPrint on
38
+ * the Tree object. It is also possible to pass a PrintWriter to
39
+ * pennPrint if you want to capture the output.
40
+ */
41
+ public static void demoDP(LexicalizedParser lp, String filename) {
42
+ // This option shows loading, sentence-segmenting and tokenizing
43
+ // a file using DocumentPreprocessor.
44
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
45
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
46
+ // You could also create a tokenizer here (as below) and pass it
47
+ // to DocumentPreprocessor
48
+ for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
49
+ Tree parse = lp.apply(sentence);
50
+ parse.pennPrint();
51
+ System.out.println();
52
+
53
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
54
+ Collection tdl = gs.typedDependenciesCCprocessed();
55
+ System.out.println(tdl);
56
+ System.out.println();
57
+ }
58
+ }
59
+
60
+ /**
61
+ * demoAPI demonstrates other ways of calling the parser with
62
+ * already tokenized text, or in some cases, raw text that needs to
63
+ * be tokenized as a single sentence. Output is handled with a
64
+ * TreePrint object. Note that the options used when creating the
65
+ * TreePrint can determine what results to print out. Once again,
66
+ * one can capture the output by passing a PrintWriter to
67
+ * TreePrint.printTree.
68
+ */
69
+ public static void demoAPI(LexicalizedParser lp) {
70
+ // This option shows parsing a list of correctly tokenized words
71
+ String[] sent = { "This", "is", "an", "easy", "sentence", "." };
72
+ List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
73
+ Tree parse = lp.apply(rawWords);
74
+ parse.pennPrint();
75
+ System.out.println();
76
+
77
+ // This option shows loading and using an explicit tokenizer
78
+ String sent2 = "This is another sentence.";
79
+ TokenizerFactory<CoreLabel> tokenizerFactory =
80
+ PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
81
+ Tokenizer<CoreLabel> tok =
82
+ tokenizerFactory.getTokenizer(new StringReader(sent2));
83
+ List<CoreLabel> rawWords2 = tok.tokenize();
84
+ parse = lp.apply(rawWords2);
85
+
86
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
87
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
88
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
89
+ List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
90
+ System.out.println(tdl);
91
+ System.out.println();
92
+
93
+ // You can also use a TreePrint object to print trees and dependencies
94
+ TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
95
+ tp.printTree(parse);
96
+ }
97
+
98
+ private ParserDemo() {} // static methods only
99
+
100
+ }
lng/L2SCA/stanford-parser-full-2014-01-04/ParserDemo2.java ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import java.io.IOException;
3
+ import java.io.StringReader;
4
+ import java.util.*;
5
+
6
+ import edu.stanford.nlp.ling.CoreLabel;
7
+ import edu.stanford.nlp.ling.HasWord;
8
+ import edu.stanford.nlp.ling.Label;
9
+ import edu.stanford.nlp.ling.Word;
10
+ import edu.stanford.nlp.process.DocumentPreprocessor;
11
+ import edu.stanford.nlp.process.Tokenizer;
12
+ import edu.stanford.nlp.trees.*;
13
+ import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
14
+
15
+ class ParserDemo2 {
16
+
17
+ /** This example shows a few more ways of providing input to a parser.
18
+ *
19
+ * Usage: ParserDemo2 [grammar [textFile]]
20
+ */
21
+ public static void main(String[] args) throws IOException {
22
+ String grammar = args.length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
23
+ String[] options = { "-maxLength", "80", "-retainTmpSubcategories" };
24
+ LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
25
+ TreebankLanguagePack tlp = lp.getOp().langpack();
26
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
27
+
28
+ Iterable<List<? extends HasWord>> sentences;
29
+ if (args.length > 1) {
30
+ DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
31
+ List<List<? extends HasWord>> tmp =
32
+ new ArrayList<List<? extends HasWord>>();
33
+ for (List<HasWord> sentence : dp) {
34
+ tmp.add(sentence);
35
+ }
36
+ sentences = tmp;
37
+ } else {
38
+ // Showing tokenization and parsing in code a couple of different ways.
39
+ String[] sent = { "This", "is", "an", "easy", "sentence", "." };
40
+ List<HasWord> sentence = new ArrayList<HasWord>();
41
+ for (String word : sent) {
42
+ sentence.add(new Word(word));
43
+ }
44
+ String sent2 = ("This is a slightly longer and more complex " +
45
+ "sentence requiring tokenization.");
46
+ // Use the default tokenizer for this TreebankLanguagePack
47
+ Tokenizer<? extends HasWord> toke =
48
+ tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
49
+ List<? extends HasWord> sentence2 = toke.tokenize();
50
+ List<List<? extends HasWord>> tmp =
51
+ new ArrayList<List<? extends HasWord>>();
52
+ tmp.add(sentence);
53
+ tmp.add(sentence2);
54
+ sentences = tmp;
55
+ }
56
+
57
+ for (List<? extends HasWord> sentence : sentences) {
58
+ Tree parse = lp.parse(sentence);
59
+ parse.pennPrint();
60
+ System.out.println();
61
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
62
+ List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
63
+ System.out.println(tdl);
64
+ System.out.println();
65
+
66
+ System.out.println("The words of the sentence:");
67
+ for (Label lab : parse.yield()) {
68
+ if (lab instanceof CoreLabel) {
69
+ System.out.println(((CoreLabel) lab).toString("{map}"));
70
+ } else {
71
+ System.out.println(lab);
72
+ }
73
+ }
74
+ System.out.println();
75
+ System.out.println(parse.taggedYield());
76
+ System.out.println();
77
+
78
+ }
79
+
80
+ // This method turns the String into a single sentence using the
81
+ // default tokenizer for the TreebankLanguagePack.
82
+ String sent3 = "This is one last test!";
83
+ lp.parse(sent3).pennPrint();
84
+ }
85
+
86
+ private ParserDemo2() {} // static methods only
87
+
88
+ }
lng/L2SCA/stanford-parser-full-2014-01-04/README.txt ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Stanford Lexicalized Parser v3.3.1 - 2014-01-04
2
+ -----------------------------------------------
3
+
4
+ Copyright (c) 2002-2012 The Board of Trustees of The Leland Stanford Junior
5
+ University. All Rights Reserved.
6
+
7
+ Original core parser code by Dan Klein. Support code, additional
8
+ modules, languages, features, internationalization, compaction, typed
9
+ dependencies, etc. by Christopher Manning, Roger Levy, Teg Grenager,
10
+ Galen Andrew, Marie-Catherine de Marneffe, Jenny Finkel, Spence Green,
11
+ Bill MacCartney, Anna Rafferty, Huihsin Tseng, Pi-Chuan Chang,
12
+ Wolfgang Maier, Richard Eckart, Richard Socher, and John Bauer.
13
+
14
+ This release prepared by John Bauer.
15
+
16
+ This package contains 4 parsers: a high-accuracy unlexicalized PCFG; a
17
+ lexicalized dependency parser; a factored model, where the estimates
18
+ of dependencies and an unlexicalized PCFG are jointly optimized to
19
+ give a lexicalized PCFG treebank parser; and an RNN parser, where
20
+ recursive neural networks trained with semantic word vectors are used
21
+ to score parse trees. Also included are grammars for various
22
+ languages for use with these parsers.
23
+
24
+ For more information about the parser API, point a web browser at the
25
+ included javadoc directory (use the browser's Open File command to open
26
+ the index.html file inside the javadoc folder). Start by looking at the
27
+ Package page for the edu.stanford.nlp.parser.lexparser package, and then
28
+ look at the page for the LexicalizedParser class documentation therein,
29
+ particularly documentation of the main method.
30
+
31
+ Secondly, you should also look at the Parser FAQ on the web:
32
+
33
+ http://nlp.stanford.edu/software/parser-faq.shtml
34
+
35
+ This software requires Java 6 (JDK 1.6.0+). (You must have installed it
36
+ separately. Check that the command "java -version" works and gives 1.6+.)
37
+
38
+
39
+ QUICKSTART
40
+
41
+ UNIX COMMAND-LINE USAGE
42
+
43
+ On a Unix system you should be able to parse the English test file with the
44
+ following command:
45
+
46
+ ./lexparser.sh data/testsent.txt
47
+
48
+ This uses the PCFG parser, which is quick to load and run, and quite accurate.
49
+
50
+ [Notes: it takes a few seconds to load the parser data before parsing
51
+ begins; continued parsing is quicker. To use the lexicalized parser, replace
52
+ englishPCFG.ser.gz with englishFactored.ser.gz in the lexparser.sh script
53
+ and use the flag -mx600m to give more memory to java.]
54
+
55
+ WINDOWS GUI USAGE
56
+
57
+ On a Windows system, assuming that java is on your PATH, you should be able
58
+ to run a parsing GUI by double-clicking on the lexparser-gui.bat icon,
59
+ or giving the command lexparser-gui in this directory from a command prompt.
60
+
61
+ Click Load File, Browse, and navigate to and select testsent.txt in
62
+ the top directory of the parser distribution. Click Load Parser,
63
+ Browse, and select the models jar, also in the top directory of the
64
+ parser distribution. From the models jar, select englishPCFG.ser.gz.
65
+ Click Parse to parse the first sentence.
66
+
67
+ OTHER USE CASES
68
+
69
+ The GUI is also available under Unix:
70
+
71
+ lexparser-gui.sh
72
+
73
+ Under Mac OS X, you can double-click on lexparser-gui.command to invoke the
74
+ GUI. The command-line version works on all platforms. Use lexparser.bat
75
+ to run it under Windows. The GUI is only for exploring the parser. It does
76
+ not allow you to save output. You need to use the command-line program or
77
+ programmatic API to do serious work with the parser.
78
+
79
+ ADDITIONAL GRAMMARS
80
+
81
+ The parser is supplied with several trained grammars. There are English
82
+ grammars based on the standard LDC Penn Treebank WSJ training sections 2-21
83
+ (wsj*), and ones based on an augmented data set, better for questions,
84
+ commands, and recent English and biomedical text (english*).
85
+
86
+ All grammars are located in the included models jar. (If you'd like to have
87
+ grammar files like in older versions of the parser, you can get them by
88
+ extracting them from the jar file with the 'jar -xf' command.)
89
+
90
+ MULTILINGUAL PARSING
91
+ In addition to the English grammars, the parser comes with trained grammars
92
+ for Arabic, Chinese, French, and German. To parse with these grammars, run
93
+
94
+ lexparser-lang.sh
95
+
96
+ with no arguments to see usage instructions. You can change language-specific
97
+ settings passed to the parser by modifying lexparser_lang.def.
98
+
99
+ You can also train and evaluate new grammars using:
100
+
101
+ lexparser-lang-train-test.sh
102
+
103
+ To see how we trained the grammars supplied in this distribution, see
104
+
105
+ bin/makeSerialized.csh
106
+
107
+ You will not be able to run this script (since it uses Stanford-specific file
108
+ paths), but you should be able to see what we did.
109
+
110
+ Arabic
111
+ Trained on parts 1-3 of the Penn Arabic Treebank (ATB) using the
112
+ pre-processing described in (Green and Manning, 2010). The default input
113
+ encoding is UTF-8 Arabic script. You can convert text in Buckwalter encoding to UTF-8
114
+ with the package edu.stanford.nlp.international.arabic.Buckwalter which is included
115
+ in stanford-parser.jar.
116
+
117
+ The parser *requires* segmentation and tokenization of raw text per the ATB standard
118
+ prior to parsing. You can generate this segmentation and tokenization with the Stanford
119
+ Word Segmenter, which is available separately at:
120
+
121
+ http://nlp.stanford.edu/software/segmenter.shtml
122
+
123
+ Chinese
124
+ There are Chinese grammars trained just on mainland material from
125
+ Xinhua and more mixed material from the LDC Chinese Treebank. The default
126
+ input encoding is GB18030.
127
+
128
+ French
129
+ Trained on the functionally annotated section of the French Treebank
130
+ (FTB) using the pre-processing described in (Green et al., 2011). For raw text input,
131
+ a tokenizer is enabled by default that produces FTB tokenization. To disable this
132
+ tokenizer, use the "-tokenized" option. To tokenize raw text separately, see
133
+ the usage information in edu.stanford.nlp.international.french.process.FrenchTokenizer.
134
+
135
+ German
136
+ Trained on the Negra corpus. Details are included in (Rafferty and
137
+ Manning, 2008).
138
+
139
+ TREEBANK PREPROCESSING
140
+
141
+ The pre-processed versions of the ATB described
142
+ in (Green and Manning, 2010) and the FTB described in (Green et al.,
143
+ 2011) can be reproduced using the TreebankPreprocessor included in this
144
+ release. The configuration files are located in /conf. For example,
145
+ to create the ATB data, run:
146
+
147
+ bin/run-tb-preproc -v conf/atb-latest.conf
148
+
149
+ Note that you'll need to update the conf file paths to your local treebank
150
+ distributions as the data is not distributed with the parser. You'll
151
+ also need to set the classpath in the cmd_line variable of run-tb-preproc.
152
+
153
+ The TreebankPreprocessor conf files support various options, which are
154
+ documented in
155
+
156
+ edu.stanford.nlp.international.process.ConfigParser
157
+
158
+ EVALUATION METRICS
159
+
160
+ The Stanford parser comes with Java implementations of the following
161
+ evaluation metrics:
162
+
163
+ Dependency Labeled Attachment
164
+
165
+ Evalb (Collins, 1997)
166
+ -Includes per-category evaluation with the -c option
167
+
168
+ Leaf Ancestor (Sampson and Babarczy, 2003)
169
+ -Both micro- and macro-averaged score
170
+
171
+ Tagging Accuracy
172
+
173
+ See the usage instructions and javadocs in the requisite classes located in
174
+ edu.stanford.nlp.parser.metrics.
175
+
176
+ LICENSE
177
+
178
+ // StanfordLexicalizedParser -- a probabilistic lexicalized NL CFG parser
179
+ // Copyright (c) 2002-2012 The Board of Trustees of
180
+ // The Leland Stanford Junior University. All Rights Reserved.
181
+ //
182
+ // This program is free software; you can redistribute it and/or
183
+ // modify it under the terms of the GNU General Public License
184
+ // as published by the Free Software Foundation; either version 2
185
+ // of the License, or (at your option) any later version.
186
+ //
187
+ // This program is distributed in the hope that it will be useful,
188
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
189
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
190
+ // GNU General Public License for more details.
191
+ //
192
+ // You should have received a copy of the GNU General Public License
193
+ // along with this program; if not, write to the Free Software
194
+ // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
195
+ //
196
+ // For more information, bug reports, fixes, contact:
197
+ // Christopher Manning
198
+ // Dept of Computer Science, Gates 1A
199
+ // Stanford CA 94305-9010
200
+ // USA
201
202
+ // http://nlp.stanford.edu/downloads/lex-parser.shtml
203
+
204
+
205
+ ---------------------------------
206
+ CHANGES
207
+ ---------------------------------
208
+
209
+ 2014-01-04 3.3.1 Bugfix release, dependency improvements
210
+
211
+ 2013-11-12 3.3.0 Remove the attr dependency, add imperatives to
212
+ English training data
213
+
214
+ 2013-06-19 3.2.0 New RNN model for WSJ and English with
215
+ improved test set accuracy, rel dependency
216
+ removed
217
+
218
+ 2013-04-05 2.0.5 Dependency improvements, ctb7 model, -nthreads
219
+ option
220
+
221
+ 2012-11-12 2.0.4 Dependency speed improvements; other
222
+ dependency changes
223
+
224
+ 2012-07-09 2.0.3 Minor bug fixes
225
+
226
+ 2012-05-22 2.0.2 Supports adding extra data in non-tree format
227
+
228
+ 2012-03-09 2.0.1 Caseless English model added, ready for maven
229
+
230
+ 2012-01-11 2.0.0 Threadsafe!
231
+
232
+ 2011-09-14 1.6.9 Added some imperatives to the English
233
+ training data; added root dependency.
234
+
235
+ 2011-06-15 1.6.8 Added French parser and leaf ancestor
236
+ evaluation metric; reorganized distribution;
237
+ new data preparation scripts; rebuilt grammar
238
+ models; other bug fixes
239
+
240
+ 2011-05-15 1.6.7 Minor bug fixes
241
+
242
+ 2011-04-17 1.6.6 Compatible with tagger, corenlp and tregex.
243
+
244
+ 2010-10-30 1.6.5 Further improvements to English Stanford
245
+ Dependencies and other minor changes
246
+
247
+ 2010-08-16 1.6.4 More minor bug fixes and improvements to English
248
+ Stanford Dependencies and question parsing
249
+
250
+ 2010-07-09 1.6.3 Improvements to English Stanford Dependencies and
251
+ question parsing, minor bug fixes
252
+
253
+ 2010-02-25 1.6.2 Improvements to Arabic parser models,
254
+ and to English and Chinese Stanford Dependencies
255
+
256
+ 2008-10-19 1.6.1 Slightly improved Arabic, German and
257
+ Stanford Dependencies
258
+
259
+ 2007-08-18 1.6 Added Arabic, k-best PCCFG parsing;
260
+ improved English grammatical relations
261
+
262
+ 2006-05-30 1.5.1 Improved English and Chinese grammatical relations;
263
+ fixed UTF-8 handling
264
+
265
+ 2005-07-20 1.5 Added grammatical relations output;
266
+ fixed bugs introduced in 1.4
267
+
268
+ 2004-03-24 1.4 Made PCFG faster again (by FSA minimization);
269
+ added German support
270
+
271
+ 2003-09-06 1.3 Made parser over twice as fast;
272
+ added tokenization options
273
+
274
+ 2003-07-20 1.2 Halved PCFG memory usage;
275
+ added support for Chinese
276
+
277
+ 2003-03-25 1.1 Improved parsing speed; included GUI,
278
+ improved PCFG grammar
279
+
280
+ 2002-12-05 1.0 Initial release
lng/L2SCA/stanford-parser-full-2014-01-04/README_dependencies.txt ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ STANFORD DEPENDENCIES. Stanford Parser v3.3.1
2
+ -----------------------------------------------------------
3
+
4
+ The manual for the English version of the Stanford Dependencies
5
+ representation:
6
+
7
+ StanfordDependenciesManual.pdf
8
+
9
+ should be consulted for the current set of dependency representations
10
+ and the correct commands for generating Stanford Dependencies together
11
+ with any of the Stanford Parser, another parser, or a treebank.
12
+
13
+ A typed dependencies representation is also available for Chinese. For
14
+ the moment the documentation consists of the code, and a brief
15
+ presentation in this paper:
16
+
17
+ Pi-Chuan Chang, Huihsin Tseng, Dan Jurafsky, and Christopher
18
+ D. Manning. 2009. Discriminative Reordering with Chinese Grammatical
19
+ Relations Features. Third Workshop on Syntax and Structure in Statistical
20
+ Translation.
21
+
22
+
23
+ --------------------------------------
24
+ ORIGINAL DEPENDENCIES SCHEME
25
+
26
+ For an overview of the original typed dependencies scheme, please look
27
+ at:
28
+
29
+ Marie-Catherine de Marneffe, Bill MacCartney, and Christopher D.
30
+ Manning. 2006. Generating Typed Dependency Parses from Phrase
31
+ Structure Parses. 5th International Conference on Language Resources
32
+ and Evaluation (LREC 2006).
33
+ http://nlp.stanford.edu/~manning/papers/LREC_2.pdf
34
+
35
+ For more discussion of the design principles, please see:
36
+
37
+ Marie-Catherine de Marneffe and Christopher D. Manning. 2008. The
38
+ Stanford typed dependencies representation. In Proceedings of the
39
+ workshop on Cross-Framework and Cross-Domain Parser Evaluation, pp. 1-8.
40
+ http://nlp.stanford.edu/~manning/papers/dependencies-coling08.pdf
41
+
42
+ These papers can be cited as references for the English Stanford
43
+ Dependencies.
44
+
45
+
46
+ --------------------------------------
47
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v3.3.1
48
+
49
+ A couple of fixes/improvements were made in the dependency conversion,
50
+ and one change was made to the taxonomy of relations.
51
+
52
+ - The partmod and infmod relations were deleted, and replaced with
53
+ vmod for reduced, non-finite verbal modifiers. The distinction between
54
+ these two relations can be recovered from the POS tag of the dependent.
55
+ - A couple of improvements were made to the conversion, the largest
56
+ one being recognizing pobj inside a PP not headed by something tagged
57
+ as IN or TO.
58
+
59
+
60
+ --------------------------------------
61
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v3.3
62
+
63
+ Some fixes/improvements were made in the dependency conversion, and one
64
+ change was made to the taxonomy of relations.
65
+
66
+ - For currency amount expressions with a currency symbol like "$", it
67
+ had previously been the case that "$" was the head, and then each
68
+ number word modified it as a number. We realized that this was
69
+ unnecessarily inconsistent. For the expression "two thousand dollars",
70
+ "dollars" is the head, but "thousand" is a num modifier of it, and
71
+ number is used for the parts of a number multi-word expression only.
72
+ This analysis is now also used for cases with a currency symbol. E.g.,
73
+ "for $ 52.7 million": prep(for, $) num($, million) number(million, 52.7).
74
+ Similarly, for "the $ 2.29 billion value", we changed the analysis from
75
+ num(value, $) number($, billion) to amod(value, $) num($, billion).
76
+ This corresponds to what you got for "a two dollar value".
77
+ This is actually the most common change (at least on WSJ newswire!).
78
+ - Remove the attr relation. Some cases disappear by making the question
79
+ phrase of WHNP be NP questions the root. Others (predicative NP
80
+ complements) become xcomp.
81
+ - Less aggressive labeling of participial form VPs as xcomp. More of them
82
+ are correctly labeled partmod (but occasionally a true xcomp is also
83
+ mislabeled as partmod).
84
+ - Small rule changes to recognize a few more ccomp and parataxis.
85
+
86
+
87
+ --------------------------------------
88
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v3.2, JUNE 2013
89
+
90
+ Various small fixes were made to the dependencies conversion,
91
+ and one change to the taxonomy of relations:
92
+ - rel was removed. rel was originally used as the relation for an
93
+ overt relativizer in a relative clause. But it was never a real
94
+ grammatical relation, and we gradually started labeling easy cases
95
+ as nsubj or dobj. In this release, rel is removed, pobj cases are
96
+ also labeled, and the remaining hard cases are labeled as dep.
97
+
98
+ --------------------------------------
99
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v2.0.5, MARCH 2013
100
+
101
+ We have begun a more major effort to improve the suitability and coverage of
102
+ Stanford Dependencies on less formal text types, and to clean up a couple of
103
+ the more quirky dependencies in the original set. These changes are still
104
+ ongoing, but in this first installment, we have removed 3 dependencies and
105
+ added 2:
106
+ - abbrev was removed, and is now viewed as just a case of appos.
107
+ - complm was removed, and is now viewed as just a case of mark.
108
+ (This is consistent with an HPSG-like usage of mark.)
109
+ - purpcl was removed, and is now viewed as just a case of advcl.
110
+ - discourse was added. The lack of a dependency type for
111
+ interjections was an omission even in the early versions, but it
112
+ became essential as we expanded our consideration of informal
113
+ text types. It is used for interjections, fillers, discourse markers
114
+ and emoticons.
115
+ - goeswith was added. In badly edited text, it is used to join the
116
+ two parts of a word.
117
+
118
+ A few other changes and improvements were also made, including improvements
119
+ in the recognition of advcl. There has been a reduction of "dep" dependencies
120
+ of about 14% on newswire (and higher on more informal text genres).
121
+
122
+
123
+ --------------------------------------
124
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v2.0.4, NOVEMBER 2012
125
+
126
+ A few minor changes and fixes were made: HYPH is now recognized, and treated
127
+ as punctuation and clausal complements of adjectives (including comparatives)
128
+ are recognized as ccomp.
129
+
130
+ --------------------------------------
131
+
132
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v1.6.9
133
+
134
+ This version adds an explicit root dependency in the set of dependencies
135
+ returned. In the past, there had been no explicit representation of the
136
+ root of the sentence in the set of dependencies returned, except in the
137
+ CoNLL format output, which always showed the root. Now, there is always
138
+ an explicit extra dependency that marks the sentence root, using a fake
139
+ ROOT pseudoword with index 0. That is, the root is marked in this way:
140
+ root(ROOT-0, depends-3)
141
+ Otherwise there were only a couple of minute changes in the dependencies
142
+ produced (appositions are now recognized in WHNPs!).
143
+
144
+ --------------------------------------
145
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v1.6.8
146
+
147
+ This version includes only small fixes, principally addressing some gaps
148
+ in the correct treatment of dependencies in inverted sentence (SQ and SINV)
149
+ constructions, and some errors in the treatment of copulas in the presence of
150
+ temporal NPs.
151
+
152
+
153
+ --------------------------------------
154
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- NOVEMBER 2010 - JANUARY 2011
155
+
156
+ Two changes were made to the taxonomy of dependencies.
157
+ - measure (phrase modifier) was generalized and replaced by
158
+ npadvmod (noun phrase adverbial modifier) which includes measure
159
+ phrases and other adverbial uses of noun phrases. Temporal NPs
160
+ (tmod) are now a subtype of npadvmod in the dependency hierarchy.
161
+ - mwe (multi-word expression) is introduced for certain common
162
+ function word dependencies for which another good analysis isn't
163
+ easy to come by (and which were frequently dep before) such as
164
+ "instead of" or "rather than".
165
+
166
+ A new option has been added to allow the copula to be treated as
167
+ the head when it has an adjective or noun complement.
168
+
169
+ The conversion software will now work fairly well with the
170
+ David Vadas version of the treebank with extra noun phrase
171
+ structure. (A few rare cases that are handled with the standard
172
+ treebank aren't yet handled, but you will get better dependencies
173
+ for compound nouns and multiword adjectival modifiers, etc.)
174
+
175
+ Considerable improvements were made in the coverage of named
176
+ dependencies. You should expect to see only about half as many generic
177
+ "dep" dependencies as in version 1.6.4.
178
+
179
+ --------------------------------------
180
+ CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- JUNE-AUGUST 2010
181
+
182
+ No new dependency relations have been introduced.
183
+
184
+ There have been some significant improvements in the generated
185
+ dependencies, principally covering:
186
+ - Better resolution of nsubj and dobj long distance dependencies
187
+ (but v1.6.4 fixes the overpercolation of dobj in v1.6.3)
188
+ - Better handling of conjunction distribution in CCprocessed option
189
+ - Correction of bug in v1.6.2 that made certain verb dependents noun
190
+ dependents.
191
+ - Better dependencies are generated for question structures (v1.6.4)
192
+ - Other minor improvements in recognizing passives, adverbial
193
+ modifiers, etc.
194
+
lng/L2SCA/stanford-parser-full-2014-01-04/StanfordDependenciesManual.pdf ADDED
Binary file (307 kB). View file
 
lng/L2SCA/stanford-parser-full-2014-01-04/bin/makeSerialized.csh ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/csh -f
2
+
3
+ # This is the file we use to make the serialized grammars for the parser.
4
+ # If you are on the Stanford NLP machines, you can use it to remake the
5
+ # serialized parsers (such as when there have been incompatible software
6
+ # changes). Don't forget to klog first so you can access the AFS corpora.
7
+ #
8
+ # If you are not on the Stanford NLP machines, then the script won't work
9
+ # for you as is, since it contains hard-coded paths to various treebanks.
10
+ # But it may still be useful to inspect it to see what options we used to
11
+ # generate the various supplied grammars.
12
+ #
13
+ # NOTE: Output files in this script should ALWAYS use relative paths, so
14
+ # that you can copy this script and run it in a different directory and
15
+ # it will write output files there.
16
+ #
17
+ # usage:
18
+ # cd /u/nlp/data/lexparser # to have files output in "usual" location
19
+ # ./makeSerialized.csh
20
+ #
21
+ ## Uncomment this bit to run it with older parser version
22
+ # setenv CLASSPATH /u/nlp/distrib/lexparser-2004-03-24/javanlp.jar:
23
+
24
+ if ( ! $?JAVANLP_HOME) then
25
+ echo 'JAVANLP_HOME is not set'
26
+ echo 'Add a line like setenv JAVANLP_HOME $HOME/javanlp to your environment'
27
+ exit
28
+ endif
29
+
30
+ set wsjptb=/afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj
31
+ # now ctb6
32
+ set ctb=/afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed
33
+ # now ctb7!
34
+ set ctb7train=/u/nlp/data/chinese/ctb7/train.mrg
35
+ set ctb7test=/u/nlp/data/chinese/ctb7/test.mrg
36
+ set negra=/afs/ir/data/linguistic-data/NEGRA/penn-format-train-dev-test
37
+
38
+ set host=`hostname | cut -d. -f1`
39
+
40
+ if ( ! -r $wsjptb) then
41
+ echo "Can't read WSJ PTB. Maybe you forgot to klog??"
42
+ exit
43
+ endif
44
+
45
+ mv -f serializedParsers.log serializedParsers.bak
46
+ uptime > serializedParsers.log
47
+ echo "Classpath is $CLASSPATH" >> serializedParsers.log
48
+
49
+ # English WSJ 2-21 PCFG binary and text grammars
50
+
51
+ ( echo "Running wsjPCFG (goodPCFG) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -saveToSerializedFile wsjPCFG.ser.gz -saveToTextFile wsjPCFG.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
52
+
53
+ # English noTagSplit no rule compaction PCFG text grammar
54
+ ( echo "Running wsjPCFG-noTagSplit-noCompact on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -noTagSplit -saveToTextFile wsjPCFG-noTagSplit.txt -compactGrammar 0 -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
55
+
56
+ # English WSJ 2-21 Factored binary
57
+
58
+ ## Not yet clear that goodFactored is better than -ijcai03 -- not on dev set
59
+ # ( echo "Running wsjFactored (goodFactored) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDAtsv" -goodFactored -saveToSerializedFile wsjFactored.ser.gz -saveToTextFile wsjFactored.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
60
+ ( echo "Running wsjFactored (ijcai03 correctTags) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -printStates -compactGrammar 0 -correctTags -saveToSerializedFile wsjFactored.ser.gz -saveToTextFile wsjFactored.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
61
+ ( echo "Running wsjFactored (ijcai03 replication) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -printStates -compactGrammar 0 -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
62
+
63
+
64
+ ## "General English" models
65
+
66
+ # english{Factored|PCFG} is currently trained on:
67
+ # - WSJ sections 1-21
68
+ # - Genia as reformatted by Andrew Clegg, his training split
69
+ # - 2 English Chinese Translation Treebank and 3 English Arabic Translation
70
+ # Treebank files backported to the original treebank annotation standards
71
+ # (by us)
72
+ # - 95 sentences parsed by us (mainly questions and imperatives; a few from
73
+ # recent newswire).
74
+
75
+ # /u/nlp/data/genia/sentences_cleaned.tree
76
+
77
+ # "General English" Factored binary
78
+
79
+
80
+
81
+ ( echo "Running englishFactored (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -ijcai03 -saveToSerializedFile englishFactored.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
82
+
83
+ # "General English" PCFG binary
84
+
85
+ ( echo "Running englishPCFG (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -saveToSerializedFile englishPCFG.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
86
+
87
+
88
+ # "General English" PCFG, case insensitive, binary
89
+
90
+ ( echo "Running caseless englishPCFG (from treebank) on $host server" ; time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.LowercaseAndAmericanizeFunction -evals factDA,tsv -goodPCFG -saveToSerializedFile englishPCFG.caseless.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
91
+
92
+
93
+ # English WSJ 2-21 PCFG simplified grammar
94
+ # This dumbed down parser is used by the RNN parser.
95
+ # See /scr/nlp/data/dvparser for more details.
96
+ ( echo "Running wsj pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile wsjPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log
97
+
98
+ # English with extras PCFG simplified grammar
99
+ # This dumbed down parser is used by the RNN parser.
100
+ # See /scr/nlp/data/dvparser for more details.
101
+ ( echo "Running english pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile englishPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log
102
+
103
+
104
+ # Xinhua Mainland Chinese PCFG binary
105
+
106
+ ( echo "Running xinhuaPCFG on $host -server" ; time java -server -mx800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -saveToSerializedFile xinhuaPCFG.ser.gz -maxLength 40 -train $ctb 026-270,301-499,600-999 -test $ctb 001-025 ) >>& ./serializedParsers.log
107
+ # new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
108
+ # newer train list (Galen and Huihsin): 026-270,301-499,600-999
109
+ # this is all Xinhua minus Stanford devel and Bikel test
110
+
111
+ # Xinhua Mainland Chinese Factored binary
112
+
113
+ ( echo "Running xinhuaFactored on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -acl03chinese -scTags -saveToSerializedFile xinhuaFactored.ser.gz -maxLength 40 -train $ctb 026-270,301-499,600-999 -test $ctb 001-025 ) >>& ./serializedParsers.log
114
+
115
+ # Mixed dialect Chinese on lots of data (with chineseFactored)
116
+
117
+ ( echo "Running chineseFactored on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chineseFactored -saveToSerializedFile chineseFactored.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log
118
+ # new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
119
+ # newer train list (Galen and Huihsin): 026-270,301-499,600-999
120
+ # this is all Xinhua minus Stanford devel and Bikel test
121
+ # CTB files 001-499, 555-589,597-1000 are from newswire of
122
+ # XinHua.
123
+ # Files 500-554 are Information Services Department of HKSAR.
124
+ # Files 590-596 and 1001-1151 are Sinorama articles, more of literature
125
+ # nature and from Taiwan.
126
+ # Files 2000-3145 are ACE broadcast news (from where?). We only use a few for now.
127
+
128
+ # Mixed dialect Chinese PCFG on lots of data
129
+
130
+ ( echo "Running chinesePCFG on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -useUnicodeType -saveToSerializedFile chinesePCFG.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log
131
+ # new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
132
+ # newer train list (Galen and Huihsin): 026-270,301-499,600-999
133
+ # this is all Xinhua minus Stanford devel and Bikel test
134
+
135
+
136
+ # Chinese parser for unsegmented Chinese
137
+
138
+ ( echo "Running xinhuaFactoredSegmenting on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -segmentMarkov -train $ctb 26-270,301-499,600-999 -sctags -acl03chinese -saveToSerializedFile xinhuaFactoredSegmenting.ser.gz ) >>& ./serializedParsers.log
139
+ java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -encoding utf-8 xinhuaFactoredSegmenting.ser.gz /u/nlp/data/lexparser/chinese-onesent-unseg-utf8.txt >>& ./serializedParsers.log
140
+
141
+
142
+ # It used to be the case that explicitly saying tLPP on command line was
143
+ # needed for file encoding. But it has been fixed.
144
+ # ( echo "Running xinhuaFactored from serialized check on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -maxLength 40 -loadFromSerializedFile xinhuaFactored.ser.gz -test $ctb 001-025 ) >>& ./serializedParsers.log
145
+ # This now works
146
+ ( echo "Running xinhuaFactored from serialized (check without specifying -tLPP) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -maxLength 40 -loadFromSerializedFile xinhuaFactored.ser.gz -test $ctb 001-025 ) >>& ./serializedParsers.log
147
+
148
+ ( echo "Running chinesePCFG (simplified for use in the RNN parser) on $host -server" ; time java -server -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chineseFactored -PCFG -compactGrammar 0 -saveToSerializedFile chinesePCFG-simple.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log
149
+
150
+ # German Factored binary from Negra (version 2)
151
+ # $negra 3 is the dev set
152
+
153
+ ( echo "Running germanFactored on $host -server" ; time java -server -mx5g edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile germanFactored.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log
154
+
155
+ # German PCFG from Negra (version 2)
156
+
157
+ ( echo "Running germanPCFG on $host -server" ; time java -server -mx2g edu.stanford.nlp.parser.lexparser.LexicalizedParser -v -evals tsv -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -PCFG -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 1 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile germanPCFG.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log
158
+
159
+ # German Dependency parser
160
+ # This requires normalizing the dependency output to strip boundary symbol.
161
+ # ( echo "Running germanDep on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -dep -hMarkov 1 -maxLength 40 -saveToSerializedFile germanDep.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log
162
+
163
+
164
+
165
+ ########
166
+ # The languages below this line use TreebankPreprocessor for pre-processing prior to training
167
+ ########
168
+ set mydir=`pwd`
169
+ set data_dir=/u/nlp/data/lexparser/trees
170
+ set tree_pipe=$JAVANLP_HOME/projects/core/scripts/run-tb-preproc
171
+ set train_sh=$JAVANLP_HOME/projects/core/scripts/lexparser-lang-train-test.sh
172
+
173
+ if( ! -e $data_dir ) then
174
+ mkdir $data_dir
175
+ endif
176
+
177
+ ########
178
+ # ARABIC
179
+ ########
180
+ set ar_data_dir=$data_dir/Arabic
181
+ set ar_conf_file=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/arabic/pipeline/configurations/atb-latest.conf
182
+ set ar_train_args="Arabic 40 $ar_data_dir/2-Unvoc-All.utf8.txt $ar_data_dir/2-Unvoc-Dev.utf8.txt BASELINE_ar -saveToSerializedFile arabicFactored.ser.gz"
183
+
184
+ if( ! -e $ar_data_dir ) then
185
+ mkdir $ar_data_dir
186
+ endif
187
+
188
+ echo Running $tree_pipe -p $ar_data_dir -v $ar_conf_file >>& ./serializedParsers.log
189
+ $tree_pipe -p $ar_data_dir -v $ar_conf_file >& $ar_data_dir/build.log
190
+
191
+ echo "" >>& ./serializedParsers.log
192
+ ( echo "Training Arabic Factored grammar using baseline feature set" ; time $train_sh $ar_train_args ) >>& ./serializedParsers.log
193
+
194
+
195
+ ########
196
+ # FRENCH
197
+ ########
198
+ set fr_data_dir=$data_dir/French
199
+ set fr_conf_file=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/configurations/ftb-latest.conf
200
+ set fr_train_args="French 40 $fr_data_dir/FTB-All.utf8.txt $fr_data_dir/FTB-Dev.utf8.txt BASELINE_fr -saveToSerializedFile frenchFactored.ser.gz"
201
+
202
+ if( ! -e $fr_data_dir ) then
203
+ mkdir $fr_data_dir
204
+ endif
205
+
206
+ echo Running $tree_pipe -p $fr_data_dir -v $fr_conf_file >>& ./serializedParsers.log
207
+ $tree_pipe -p $fr_data_dir -v $fr_conf_file >& $fr_data_dir/build.log
208
+
209
+ echo "" >>& ./serializedParsers.log
210
+ echo time $train_sh $fr_train_args >>& ./serializedParsers.log
211
+ ( echo "Training French Factored grammar using baseline feature set" ; time $train_sh $fr_train_args ) >>& ./serializedParsers.log
212
+
213
+
214
+
215
+
216
+ ## English just to check parser code regression (not saved)
217
+
218
+ ## Just for reference
219
+ ( echo "Running wsjPCFG (acl03pcfg replication) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -acl03pcfg -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
220
+
221
+ ## See if same results from serialized parser
222
+ ( echo "Running wsjFactored (ijcai03 from serialized) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -v -maxLength 40 -loadFromSerializedFile wsjFactored.ser.gz -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
223
+ # ( echo "Running wsjFactored (ijcai03 with nodeprune) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -compactGrammar 0 -nodePrune true -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
224
+
225
+ ## See if same results from text grammar parser
226
+ ( echo "Running wsjFactored (ijcai03 from textGrammar) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -v -maxLength 40 -loadFromTextFile wsjFactored.txt -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
227
+
228
+ uptime >> serializedParsers.log
229
+
230
+ mv -f serializedParsersPerformance.last serializedParsersPerformance.2ndlast
231
+ mv -f serializedParsersPerformance.current serializedParsersPerformance.last
232
+ echo -n "Parser run by $USER on " > serializedParsersPerformance.current
233
+ date >> serializedParsersPerformance.current
234
+ grep 'N: 253\|N: 393\|Done testing on treebank\|Running \| summary ' serializedParsers.log >> serializedParsersPerformance.current
235
+ echo >> serializedParsersPerformance.current
236
+ echo >> serializedParsersPerformance.current
237
+
238
+ cat serializedParsersPerformance.current >> serializedParsersPerformance.txt
239
+
240
+ cp -f serializedParsers.last serializedParsers.2ndlast
241
+ cp -f serializedParsers.current serializedParsers.last
242
+ cp -f serializedParsers.log serializedParsers.current
lng/L2SCA/stanford-parser-full-2014-01-04/bin/run-tb-preproc ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ #
3
+ # Convenience script for running
4
+ # edu.stanford.nlp.trees.treebank.TreebankPreprocessor.
5
+ #
6
+ # This package automatically generates the Arabic and French
7
+ # parser training data from the respective source distributions.
8
+ #
9
+ # See the README for more details.
10
+ #
11
+ # author: Spence Green
12
+ ##############################
13
+
14
+ import sys
15
+ from optparse import OptionParser
16
+ import os
17
+ import subprocess
18
+ from time import sleep
19
+
20
def run_treebank_pipeline(opts,conf_file):
    # Build the TreebankPreprocessor command line from the parsed options
    # and run it, echoing the subprocess's output as it is produced.
    #
    # Args:
    #   opts      -- optparse options object (jmem, verbose, extra, output_path)
    #   conf_file -- path to the TreebankPreprocessor configuration file
    cmd_line = 'java -Xmx%s -Xms%s edu.stanford.nlp.trees.treebank.TreebankPreprocessor' % (opts.jmem,opts.jmem)

    if opts.verbose:
        cmd_line = cmd_line + ' -v'

    if opts.extra:
        cmd_line = cmd_line + ' ' + opts.extra

    if opts.output_path:
        cmd_line = cmd_line + ' -p ' + opts.output_path

    cmd_line = cmd_line + ' ' + conf_file

    p = call_command(cmd_line)

    # Stream the child's stdout until EOF instead of looping on p.poll():
    # the old poll-driven loop could stop before buffered output was read,
    # and 'is None' is the correct identity test anyway (was '== None').
    while True:
        out_str = p.stdout.readline()
        if not out_str:
            break
        # Write the raw line rather than 'print out_str[:-1]' so a final
        # line without a trailing newline is not truncated.
        # NOTE(review): under Python 3 this pipe yields bytes — the script
        # is Python 2 era (print statement, optparse); confirm if ported.
        sys.stdout.write(out_str)
    # Reap the child so its exit status is collected.
    p.wait()
40
+
41
# Launch 'command' asynchronously and return the Popen handle.
def call_command(command):
    # Start the process with stderr merged into the stdout pipe and return
    # immediately without waiting for completion (the caller streams output).
    #
    # shlex.split honors shell-style quoting, so input/output paths that
    # contain spaces can be passed as quoted arguments — the previous
    # command.split(' ') could not handle them (see the old TODO).
    import shlex  # local import: the module header does not import shlex
    process = subprocess.Popen(shlex.split(command), stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    return process
45
+
46
# Entry point: parse the command-line options and run the treebank
# pre-processing pipeline on the single required configuration file.
def main():
    opt_parser = OptionParser(usage='usage: %prog [opts] conf_file')
    opt_parser.add_option('-m', '--java-mem', dest='jmem', default='500m',
                          help='Set JVM memory heap size (e.g. 500m)')
    opt_parser.add_option('-v', '--verbose', dest='verbose',
                          action='store_true', default=False,
                          help='Verbose mode')
    opt_parser.add_option('-o', '--options', dest='extra',
                          help='Pass options directly to TreebankPreprocessor')
    opt_parser.add_option('-p', '--output-path', dest='output_path',
                          help="Destination directory for the output")

    opts, args = opt_parser.parse_args()

    # Exactly one positional argument (the conf file) is required.
    if len(args) != 1:
        opt_parser.print_help()
        sys.exit(-1)

    run_treebank_pipeline(opts, args[0])

if __name__ == '__main__':
    main()
lng/L2SCA/stanford-parser-full-2014-01-04/build.xml ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- build.xml file for ant for JavaNLP -->
2
+
3
+ <!-- A "project" describes a set of targets that may be requested
4
+ when Ant is executed. The "default" attribute defines the
5
+ target which is executed if no specific target is requested,
6
+ and the "basedir" attribute defines the current working directory
7
+ from which Ant executes the requested task. This is normally
8
+ set to the current working directory.
9
+ -->
10
+
11
+ <project name="JavaNLP" default="compile" basedir=".">
12
+
13
+ <property name="build.home" value="${basedir}/classes"/>
14
+ <property name="build.tests" value="${basedir}/classes"/>
15
+ <property name="docs.home" value="${basedir}/docs"/>
16
+ <property name="src.home" value="${basedir}/src"/>
17
+ <property name="javadoc.home" value="${basedir}/javadoc"/>
18
+
19
+
20
+ <!-- ==================== Compilation Control Options ==================== -->
21
+
22
+ <!--
23
+
24
+ These properties control option settings on the Javac compiler when it
25
+ is invoked using the <javac> task.
26
+
27
+ compile.debug Should compilation include the debug option?
28
+
29
+ compile.deprecation Should compilation include the deprecation option?
30
+
31
+ compile.optimize Should compilation include the optimize option?
32
+
33
+ compile.source Source version compatibility
34
+
35
+ compile.target Target class version compatibility
36
+
37
+ -->
38
+
39
+ <property name="compile.debug" value="true"/>
40
+ <property name="compile.deprecation" value="false"/>
41
+ <property name="compile.optimize" value="true"/>
42
+ <property name="compile.source" value="1.6" />
43
+ <property name="compile.target" value="1.6" />
44
+
45
+
46
+
47
+
48
+ <!-- ==================== All Target ====================================== -->
49
+
50
+ <!--
51
+
52
+ The "all" target is a shortcut for running the "clean" target followed
53
+ by the "compile" target, to force a complete recompile.
54
+
55
+ -->
56
+
57
+ <target name="all" depends="clean,compile"
58
+ description="Clean build and dist directories, then compile"/>
59
+
60
+
61
+
62
+ <!-- ==================== Clean Target ==================================== -->
63
+
64
+ <!--
65
+
66
+ The "clean" target deletes any previous "build" and "dist" directory,
67
+ so that you can be ensured the application can be built from scratch.
68
+
69
+ -->
70
+
71
+ <target name="clean" description="Delete old classes">
72
+ <delete dir="${build.home}/edu"/>
73
+ </target>
74
+
75
+
76
+ <!-- ==================== Classpath Targets ==================================== -->
77
+
78
+ <!--
79
+
80
+ Sets the classpath for this project properly. We now always use the
81
+ lib dir within javanlp.
82
+
83
+ -->
84
+
85
+ <target name="classpath" description="Sets the classpath">
86
+ <path id="compile.classpath">
87
+ <fileset dir="${basedir}">
88
+ <include name="*.jar"/>
89
+ <exclude name="stanford-parser*"/>
90
+ </fileset>
91
+ </path>
92
+ </target>
93
+
94
+
95
+
96
+
97
+
98
+ <!-- ==================== Compile Target ================================== -->
99
+
100
+ <!--
101
+
102
+ The "compile" target transforms source files (from your "src" directory)
103
+ into object files in the appropriate location in the build directory.
104
+ This example assumes that you will be including your classes in an
105
+ unpacked directory hierarchy under "/WEB-INF/classes".
106
+
107
+ -->
108
+
109
+ <target name="compile" depends="prepare,classpath"
110
+ description="Compile Java sources">
111
+
112
+ <!-- Compile Java classes as necessary -->
113
+ <mkdir dir="${build.home}"/>
114
+ <javac srcdir="${src.home}"
115
+ destdir="${build.home}"
116
+ debug="${compile.debug}"
117
+ encoding="utf-8"
118
+ deprecation="${compile.deprecation}"
119
+ optimize="${compile.optimize}"
120
+ source="${compile.source}"
121
+ target="${compile.target}"
122
+ includeantruntime="false">
123
+ <classpath refid="compile.classpath"/>
124
+ <compilerarg value="-Xmaxerrs"/>
125
+ <compilerarg value="20"/>
126
+ <!-- <compilerarg value="-Xlint"/> -->
127
+ </javac>
128
+
129
+ <!-- Copy application resources -->
130
+ <!--
131
+ <copy todir="${build.home}/WEB-INF/classes">
132
+ <fileset dir="${src.home}" excludes="**/*.java"/>
133
+ </copy>
134
+ -->
135
+
136
+ </target>
137
+
138
+
139
+ <!-- ==================== Javadoc Target ================================== -->
140
+
141
+ <!--
142
+
143
+ The "javadoc" target creates Javadoc API documentation for the Java
144
+ classes included in your application. Normally, this is only required
145
+ when preparing a distribution release, but is available as a separate
146
+ target in case the developer wants to create Javadocs independently.
147
+
148
+ -->
149
+
150
+ <target name="javadoc" depends="compile"
151
+ description="Create Javadoc API documentation">
152
+
153
+ <mkdir dir="${javadoc.home}"/>
154
+ <javadoc sourcepath="${src.home}"
155
+ destdir="${javadoc.home}"
156
+ maxmemory="768m"
157
+ author="true"
158
+ source="1.6"
159
+ Overview="${src.home}/edu/stanford/nlp/overview.html"
160
+ Doctitle="Stanford JavaNLP API Documentation"
161
+ Windowtitle="Stanford JavaNLP API"
162
+ packagenames="*">
163
+ <bottom><![CDATA[<FONT SIZE=2><A HREF=\"http://nlp.stanford.edu\">Stanford NLP Group</A></FONT>]]></bottom>
164
+ <link href="http://java.sun.com/j2se/1.6.0/docs/api/"/>
165
+ </javadoc>
166
+
167
+ </target>
168
+
169
+
170
+ <!-- ==================== Prepare Target ================================== -->
171
+
172
+ <!--
173
+
174
+ The "prepare" target is used to create the "build" destination directory,
175
+ and copy the static contents of your web application to it. If you need
176
+ to copy static files from external dependencies, you can customize the
177
+ contents of this task.
178
+
179
+ Normally, this task is executed indirectly when needed.
180
+
181
+ -->
182
+
183
+ <target name="prepare">
184
+
185
+ <!-- Create build directories as needed -->
186
+ <mkdir dir="${build.home}"/>
187
+
188
+ </target>
189
+
190
+ </project>
lng/L2SCA/stanford-parser-full-2014-01-04/conf/atb-latest.conf ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ###########################
2
+ # Baseline ATB Newswire Datasets
3
+ #
4
+ # This file creates the three data sets used in the current
5
+ # line of Arabic parsing research:
6
+ #
7
+ # (1) Raw (no Bies mapping) / Unvocalized ("Raw")
8
+ # (2) Bies + DT / Unvocalized ("Unvoc")
9
+ # (3) Bies + DT / Vocalized ("Voc")
10
+ # (4) Bies + DT / Unvocalized ("NoDashTags")
11
+ # -No traces or phrasal tag decorations. For training the Berkeley parser.
12
+ #
13
+ # Note that "Bies + DT" refers to the enhancement to the Bies mappings
14
+ # proposed by Kulick et al. (2006).
15
+ #
16
+ # The training/dev/test set is the "Mona Diab split" from the 2005 JHU
17
+ # workshop on parsing Arabic dialects (Chiang et al., 2006).
18
+ #
19
+ #
20
+ # IMPORTANT: All paths should reference the base Arabic data directory
21
+ #
22
+ # /u/nlp/data/Arabic
23
+ #
24
+ ###########################
25
+
26
+ NAME=1 Raw Train
27
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
28
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
29
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
30
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
31
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train
32
+ OUTPUT_ENCODING=UTF8
33
+ FLAT=true
34
+
35
+ ;;
36
+
37
+ NAME=1 Raw Dev
38
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
39
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
40
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
41
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
42
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev
43
+ OUTPUT_ENCODING=UTF8
44
+ FLAT=true
45
+
46
+ ;;
47
+
48
+ NAME=1 Raw Test
49
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
50
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
51
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
52
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
53
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test
54
+ OUTPUT_ENCODING=UTF8
55
+ FLAT=true
56
+
57
+ ;;
58
+
59
+ NAME=2 Unvoc All
60
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
61
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
62
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
63
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
64
+ OUTPUT_ENCODING=UTF8
65
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
66
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
67
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
68
+ USEDET=true
69
+
70
+ ;;
71
+
72
+ NAME=2 Unvoc Train
73
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
74
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
75
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
76
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
77
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train
78
+ OUTPUT_ENCODING=UTF8
79
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
80
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
81
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
82
+ USEDET=true
83
+
84
+ ;;
85
+
86
+ NAME=2 Unvoc Dev
87
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
88
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
89
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
90
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
91
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev
92
+ OUTPUT_ENCODING=UTF8
93
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
94
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
95
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
96
+ USEDET=true
97
+
98
+ ;;
99
+
100
+ NAME=2 Unvoc Test
101
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
102
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
103
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
104
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
105
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test
106
+ OUTPUT_ENCODING=UTF8
107
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
108
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
109
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
110
+ USEDET=true
111
+
112
+ ;;
113
+
114
+ NAME=3 Voc Train
115
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
116
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel
117
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel
118
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel
119
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train
120
+ OUTPUT_ENCODING=UTF8
121
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
122
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
123
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
124
+ USEDET=true
125
+ LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper
126
+ FLAT=true
127
+
128
+ ;;
129
+
130
+ NAME=3 Voc Dev
131
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
132
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel
133
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel
134
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel
135
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev
136
+ OUTPUT_ENCODING=UTF8
137
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
138
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
139
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
140
+ USEDET=true
141
+ LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper
142
+ FLAT=true
143
+
144
+ ;;
145
+
146
+ NAME=3 Voc Test
147
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
148
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel
149
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel
150
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel
151
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test
152
+ OUTPUT_ENCODING=UTF8
153
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
154
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
155
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
156
+ USEDET=true
157
+ LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper
158
+ FLAT=true
159
+
160
+ ;;
161
+
162
+
163
+ NAME=4 Unvoc Train NoDashTags
164
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
165
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
166
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
167
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
168
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train
169
+ OUTPUT_ENCODING=UTF8
170
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
171
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
172
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
173
+ NODASHTAGS=true
174
+ ADDROOT=true
175
+ USEDET=true
176
+
177
+ ;;
178
+
179
+ NAME=4 Unvoc Dev NoDashTags
180
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
181
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
182
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
183
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
184
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev
185
+ OUTPUT_ENCODING=UTF8
186
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
187
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
188
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
189
+ NODASHTAGS=true
190
+ ADDROOT=true
191
+ USEDET=true
192
+
193
+ ;;
194
+
195
+ NAME=4 Unvoc Test NoDashTags
196
+ TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
197
+ PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
198
+ PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
199
+ PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
200
+ SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test
201
+ OUTPUT_ENCODING=UTF8
202
+ MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
203
+ MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
204
+ MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
205
+ NODASHTAGS=true
206
+ ADDROOT=true
207
+ USEDET=true
208
+
209
+ ;;
lng/L2SCA/stanford-parser-full-2014-01-04/conf/ftb-latest.conf ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ###########################
2
+ # Baseline FTB Datasets
3
+ #
4
+ # IMPORTANT: All paths should reference the base French Treebank data directory
5
+ #
6
+ # /u/nlp/data/FrenchTreebank/versionJune2010
7
+ #
8
+ ###########################
9
+
10
+ NAME=FTB All
11
+ TYPE=edu.stanford.nlp.international.french.pipeline.FTBDataset
12
+ PATH=/u/nlp/data/FrenchTreebank/versionJune2010/corpus-fonctions
13
+ OUTPUT_ENCODING=UTF8
14
+ TVISITOR=edu.stanford.nlp.international.french.pipeline.FTBCorrectorVisitor
15
+ FLAT=true
16
+
17
+ ;;
18
+
19
+ NAME=FTB Train
20
+ TYPE=edu.stanford.nlp.international.french.pipeline.FTBDataset
21
+ PATH=/u/nlp/data/FrenchTreebank/versionJune2010/corpus-fonctions
22
+ SPLIT=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/splits/candito.train
23
+ OUTPUT_ENCODING=UTF8
24
+ TVISITOR=edu.stanford.nlp.international.french.pipeline.FTBCorrectorVisitor
25
+
26
+ ;;
27
+
28
+ NAME=FTB Dev
29
+ TYPE=edu.stanford.nlp.international.french.pipeline.FTBDataset
30
+ PATH=/u/nlp/data/FrenchTreebank/versionJune2010/corpus-fonctions
31
+ SPLIT=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/splits/candito.dev
32
+ OUTPUT_ENCODING=UTF8
33
+ TVISITOR=edu.stanford.nlp.international.french.pipeline.FTBCorrectorVisitor
34
+
35
+ ;;
36
+
37
+ NAME=FTB Test
38
+ TYPE=edu.stanford.nlp.international.french.pipeline.FTBDataset
39
+ PATH=/u/nlp/data/FrenchTreebank/versionJune2010/corpus-fonctions
40
+ SPLIT=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/splits/candito.test
41
+ OUTPUT_ENCODING=UTF8
42
+ TVISITOR=edu.stanford.nlp.international.french.pipeline.FTBCorrectorVisitor
43
+
44
+ ;;
lng/L2SCA/stanford-parser-full-2014-01-04/data/arabic-onesent-utf8.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ و نشر العدل من خلال قضاء مستقل .
lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-gb18030.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ���� ϣ�� ���� û�� ���� ������ �ƻ� ��
lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-unseg-gb18030.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ����Ժ��ǰ��������֪ͨ��Ҫ�������ʵ��ʵ��֤�г���Ӧ�ĸ������ߣ�ά����ʳƷ�۸��ȶ���
lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-unseg-utf8.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 国务院日前发出紧急通知,要求各地切实落实保证市场供应的各项政策,维护副食品价格稳定。
lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-utf8.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 俄国 希望 伊朗 没有 制造 核武器 计划 。
lng/L2SCA/stanford-parser-full-2014-01-04/data/english-onesent.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ The quick brown fox jumped over the lazy dog.
lng/L2SCA/stanford-parser-full-2014-01-04/data/french-onesent.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Le gouvernement se résout donc à renvoyer la balle dans le camp de partenaires qui ont amplement fait la preuve de leur incapacité à gérer le système de santé .
lng/L2SCA/stanford-parser-full-2014-01-04/data/german-onesent.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Wir haben nichts zu tun .
lng/L2SCA/stanford-parser-full-2014-01-04/data/pos-sentences.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ People can butter their bread with a knife .
2
+ People can butter/VB their bread with a knife .
3
+ People can butter/NN their bread with a knife .
4
+ People/NNS can/MD butter/VB their/PRP$ bread/NN with/IN a/DT knife/NN ./.
5
+ People/NNS can/VB butter/NN their/PRP$ bread/NN with/IN a/DT knife/NN ./.
6
+ People/NNS can/NN butter/NN their/PRP$ bread/NN with/IN a/DT knife/NN ./.
7
+ People/NN can/NN butter/NN their/NN bread/NN with/NN a/NN knife/NN ./NN
lng/L2SCA/stanford-parser-full-2014-01-04/data/testsent.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Scores of properties are under extreme fire threat as a huge blaze
2
+ continues to advance through Sydney's north-western suburbs. Fires
3
+ have also shut down the major road and rail links between Sydney and
4
+ Gosford.
5
+
6
+ The promotional stop in Sydney was everything to be expected for a
7
+ Hollywood blockbuster - phalanxes of photographers, a stretch limo to
8
+ a hotel across the Quay - but with one difference. A line-up of
9
+ masseurs was waiting to take the media in hand. Never has the term
10
+ "massaging the media" seemed so accurate.
lng/L2SCA/stanford-parser-full-2014-01-04/ejml-0.23.jar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0250933fe8cc6a44eb098016d4dadaba7746a27efc3d5a7f4f4c9bf247cfe09
3
+ size 211938
lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.bat ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ :: runs the parser GUI
2
+ :: usage lexparser-gui [parserDataFilename [textFileName]]
3
+ java -mx800m -cp "*" edu.stanford.nlp.parser.ui.Parser
lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.command ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Runs the Lexicalized Parser GUI. You can just run this script and then
4
+ # load a grammar and file to be parsed from the menus or you can specify
5
+ # them on the command line.
6
+ #
7
+ # Usage: ./lexparser-gui.sh [parserDataFilename [textFileName]]
8
+ #
9
+
10
+
11
+ scriptdir=`dirname $0`
12
+
13
+ java -mx800m -cp "$scriptdir/*" edu.stanford.nlp.parser.ui.Parser $*
lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Runs the Lexicalized Parser GUI. You can just run this script and then
4
+ # load a grammar and file to be parsed from the menus or you can specify
5
+ # them on the command line.
6
+ #
7
+ # Usage: ./lexparser-gui.sh [parserDataFilename [textFileName]]
8
+ #
9
+
10
+
11
+ scriptdir=`dirname $0`
12
+
13
+ java -mx800m -cp "$scriptdir/*" edu.stanford.nlp.parser.ui.Parser $*
lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-lang-train-test.sh ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Defines standard configurations for training and evaluating the
4
+ # multilingual parsers (Arabic, Chinese, German, French). You can
5
+ # also train and test the English parsers with this script.
6
+ #
7
+ # For details on the language-specific options, see the javadocs and
8
+ # lexparser_lang.def.
9
+ #
10
+
11
+ # Memory limit
12
+ mem=6g
13
+
14
+ if [ ! $# -ge 5 ]; then
15
+ echo Usage: `basename $0` lang len train_file test_file out_file features
16
+ echo
17
+ echo ' lang : Language to parse (Arabic, English, Chinese, German, French)'
18
+ echo ' len : Maximum length of the sentences to parse'
19
+ echo ' train_file : Training treebank file'
20
+ echo ' test_file : Test treebank file (for evaluation)'
21
+ echo ' out_file : Prefix for the output filename'
22
+ echo ' features : Variable length list of optional parser features'
23
+ echo
24
+ echo 'Parser memory limit is currently:' "$mem"
25
+ echo
26
+ exit
27
+ fi
28
+
29
+ # Setup command-line options
30
+ lang=$1
31
+ len=$2
32
+ train_path=$3
33
+ test_file=$4
34
+ out_file=$5
35
+
36
+ shift 5
37
+
38
+ # Language-specific configuration
39
+ scriptdir=`dirname $0`
40
+ echo $JAVANLP_HOME
41
+ source $JAVANLP_HOME/projects/core/scripts/lexparser_lang.def
42
+
43
+ # Setting classpath
44
+ #CLASSPATH="$CLASSPATH":"$scriptdir/*"
45
+
46
+ # Run the Stanford parser
47
+ java -Xmx"$mem" -cp "$scriptdir/*:$CLASSPATH" edu.stanford.nlp.parser.lexparser.LexicalizedParser -maxLength "$len" \
48
+ -tLPP "$tlp" $lang_opts $* -writeOutputFiles \
49
+ -outputFilesExtension "$out_file"."$len".stp -outputFormat "penn" \
50
+ -outputFormatOptions "removeTopBracket,includePunctuationDependencies" -train "$train_path" -test "$test_file"