Spaces:
Sleeping
Sleeping
mohdelgaar
commited on
Commit
•
b028d48
1
Parent(s):
674b430
upload lng
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- lng/L2SCA/LICENSE.txt +339 -0
- lng/L2SCA/Makefile +12 -0
- lng/L2SCA/README-L2SCA.txt +94 -0
- lng/L2SCA/README-gui.txt +206 -0
- lng/L2SCA/README-tregex.txt +429 -0
- lng/L2SCA/README-tsurgeon.txt +529 -0
- lng/L2SCA/Semgrex.ppt +0 -0
- lng/L2SCA/analyzeFolder.py +148 -0
- lng/L2SCA/analyzeText.py +146 -0
- lng/L2SCA/examples/atree +1 -0
- lng/L2SCA/examples/exciseNP +6 -0
- lng/L2SCA/examples/relabelWithGroupName +4 -0
- lng/L2SCA/examples/renameVerb +3 -0
- lng/L2SCA/lib/ABOUT-AppleJavaExtensions.txt +29 -0
- lng/L2SCA/lib/AppleJavaExtensions.jar +3 -0
- lng/L2SCA/lib/README-AppleJavaExtensions.txt +46 -0
- lng/L2SCA/run-tregex-gui.bat +1 -0
- lng/L2SCA/run-tregex-gui.command +2 -0
- lng/L2SCA/samples/my_sample.txt +1 -0
- lng/L2SCA/samples/sample1.txt +10 -0
- lng/L2SCA/samples/sample1_output +2 -0
- lng/L2SCA/samples/sample2.txt +1 -0
- lng/L2SCA/samples/samples_output +3 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/LICENSE.txt +339 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/Makefile +13 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/ParserDemo.java +100 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/ParserDemo2.java +88 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/README.txt +280 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/README_dependencies.txt +194 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/StanfordDependenciesManual.pdf +0 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/bin/makeSerialized.csh +242 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/bin/run-tb-preproc +65 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/build.xml +190 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/conf/atb-latest.conf +209 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/conf/ftb-latest.conf +44 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/arabic-onesent-utf8.txt +1 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-gb18030.txt +1 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-unseg-gb18030.txt +1 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-unseg-utf8.txt +1 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-utf8.txt +1 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/english-onesent.txt +1 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/french-onesent.txt +1 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/german-onesent.txt +1 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/pos-sentences.txt +7 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/data/testsent.txt +10 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/ejml-0.23.jar +3 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.bat +3 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.command +13 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.sh +13 -0
- lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-lang-train-test.sh +50 -0
lng/L2SCA/LICENSE.txt
ADDED
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
GNU GENERAL PUBLIC LICENSE
|
2 |
+
Version 2, June 1991
|
3 |
+
|
4 |
+
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
|
5 |
+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
6 |
+
Everyone is permitted to copy and distribute verbatim copies
|
7 |
+
of this license document, but changing it is not allowed.
|
8 |
+
|
9 |
+
Preamble
|
10 |
+
|
11 |
+
The licenses for most software are designed to take away your
|
12 |
+
freedom to share and change it. By contrast, the GNU General Public
|
13 |
+
License is intended to guarantee your freedom to share and change free
|
14 |
+
software--to make sure the software is free for all its users. This
|
15 |
+
General Public License applies to most of the Free Software
|
16 |
+
Foundation's software and to any other program whose authors commit to
|
17 |
+
using it. (Some other Free Software Foundation software is covered by
|
18 |
+
the GNU Lesser General Public License instead.) You can apply it to
|
19 |
+
your programs, too.
|
20 |
+
|
21 |
+
When we speak of free software, we are referring to freedom, not
|
22 |
+
price. Our General Public Licenses are designed to make sure that you
|
23 |
+
have the freedom to distribute copies of free software (and charge for
|
24 |
+
this service if you wish), that you receive source code or can get it
|
25 |
+
if you want it, that you can change the software or use pieces of it
|
26 |
+
in new free programs; and that you know you can do these things.
|
27 |
+
|
28 |
+
To protect your rights, we need to make restrictions that forbid
|
29 |
+
anyone to deny you these rights or to ask you to surrender the rights.
|
30 |
+
These restrictions translate to certain responsibilities for you if you
|
31 |
+
distribute copies of the software, or if you modify it.
|
32 |
+
|
33 |
+
For example, if you distribute copies of such a program, whether
|
34 |
+
gratis or for a fee, you must give the recipients all the rights that
|
35 |
+
you have. You must make sure that they, too, receive or can get the
|
36 |
+
source code. And you must show them these terms so they know their
|
37 |
+
rights.
|
38 |
+
|
39 |
+
We protect your rights with two steps: (1) copyright the software, and
|
40 |
+
(2) offer you this license which gives you legal permission to copy,
|
41 |
+
distribute and/or modify the software.
|
42 |
+
|
43 |
+
Also, for each author's protection and ours, we want to make certain
|
44 |
+
that everyone understands that there is no warranty for this free
|
45 |
+
software. If the software is modified by someone else and passed on, we
|
46 |
+
want its recipients to know that what they have is not the original, so
|
47 |
+
that any problems introduced by others will not reflect on the original
|
48 |
+
authors' reputations.
|
49 |
+
|
50 |
+
Finally, any free program is threatened constantly by software
|
51 |
+
patents. We wish to avoid the danger that redistributors of a free
|
52 |
+
program will individually obtain patent licenses, in effect making the
|
53 |
+
program proprietary. To prevent this, we have made it clear that any
|
54 |
+
patent must be licensed for everyone's free use or not licensed at all.
|
55 |
+
|
56 |
+
The precise terms and conditions for copying, distribution and
|
57 |
+
modification follow.
|
58 |
+
|
59 |
+
GNU GENERAL PUBLIC LICENSE
|
60 |
+
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
61 |
+
|
62 |
+
0. This License applies to any program or other work which contains
|
63 |
+
a notice placed by the copyright holder saying it may be distributed
|
64 |
+
under the terms of this General Public License. The "Program", below,
|
65 |
+
refers to any such program or work, and a "work based on the Program"
|
66 |
+
means either the Program or any derivative work under copyright law:
|
67 |
+
that is to say, a work containing the Program or a portion of it,
|
68 |
+
either verbatim or with modifications and/or translated into another
|
69 |
+
language. (Hereinafter, translation is included without limitation in
|
70 |
+
the term "modification".) Each licensee is addressed as "you".
|
71 |
+
|
72 |
+
Activities other than copying, distribution and modification are not
|
73 |
+
covered by this License; they are outside its scope. The act of
|
74 |
+
running the Program is not restricted, and the output from the Program
|
75 |
+
is covered only if its contents constitute a work based on the
|
76 |
+
Program (independent of having been made by running the Program).
|
77 |
+
Whether that is true depends on what the Program does.
|
78 |
+
|
79 |
+
1. You may copy and distribute verbatim copies of the Program's
|
80 |
+
source code as you receive it, in any medium, provided that you
|
81 |
+
conspicuously and appropriately publish on each copy an appropriate
|
82 |
+
copyright notice and disclaimer of warranty; keep intact all the
|
83 |
+
notices that refer to this License and to the absence of any warranty;
|
84 |
+
and give any other recipients of the Program a copy of this License
|
85 |
+
along with the Program.
|
86 |
+
|
87 |
+
You may charge a fee for the physical act of transferring a copy, and
|
88 |
+
you may at your option offer warranty protection in exchange for a fee.
|
89 |
+
|
90 |
+
2. You may modify your copy or copies of the Program or any portion
|
91 |
+
of it, thus forming a work based on the Program, and copy and
|
92 |
+
distribute such modifications or work under the terms of Section 1
|
93 |
+
above, provided that you also meet all of these conditions:
|
94 |
+
|
95 |
+
a) You must cause the modified files to carry prominent notices
|
96 |
+
stating that you changed the files and the date of any change.
|
97 |
+
|
98 |
+
b) You must cause any work that you distribute or publish, that in
|
99 |
+
whole or in part contains or is derived from the Program or any
|
100 |
+
part thereof, to be licensed as a whole at no charge to all third
|
101 |
+
parties under the terms of this License.
|
102 |
+
|
103 |
+
c) If the modified program normally reads commands interactively
|
104 |
+
when run, you must cause it, when started running for such
|
105 |
+
interactive use in the most ordinary way, to print or display an
|
106 |
+
announcement including an appropriate copyright notice and a
|
107 |
+
notice that there is no warranty (or else, saying that you provide
|
108 |
+
a warranty) and that users may redistribute the program under
|
109 |
+
these conditions, and telling the user how to view a copy of this
|
110 |
+
License. (Exception: if the Program itself is interactive but
|
111 |
+
does not normally print such an announcement, your work based on
|
112 |
+
the Program is not required to print an announcement.)
|
113 |
+
|
114 |
+
These requirements apply to the modified work as a whole. If
|
115 |
+
identifiable sections of that work are not derived from the Program,
|
116 |
+
and can be reasonably considered independent and separate works in
|
117 |
+
themselves, then this License, and its terms, do not apply to those
|
118 |
+
sections when you distribute them as separate works. But when you
|
119 |
+
distribute the same sections as part of a whole which is a work based
|
120 |
+
on the Program, the distribution of the whole must be on the terms of
|
121 |
+
this License, whose permissions for other licensees extend to the
|
122 |
+
entire whole, and thus to each and every part regardless of who wrote it.
|
123 |
+
|
124 |
+
Thus, it is not the intent of this section to claim rights or contest
|
125 |
+
your rights to work written entirely by you; rather, the intent is to
|
126 |
+
exercise the right to control the distribution of derivative or
|
127 |
+
collective works based on the Program.
|
128 |
+
|
129 |
+
In addition, mere aggregation of another work not based on the Program
|
130 |
+
with the Program (or with a work based on the Program) on a volume of
|
131 |
+
a storage or distribution medium does not bring the other work under
|
132 |
+
the scope of this License.
|
133 |
+
|
134 |
+
3. You may copy and distribute the Program (or a work based on it,
|
135 |
+
under Section 2) in object code or executable form under the terms of
|
136 |
+
Sections 1 and 2 above provided that you also do one of the following:
|
137 |
+
|
138 |
+
a) Accompany it with the complete corresponding machine-readable
|
139 |
+
source code, which must be distributed under the terms of Sections
|
140 |
+
1 and 2 above on a medium customarily used for software interchange; or,
|
141 |
+
|
142 |
+
b) Accompany it with a written offer, valid for at least three
|
143 |
+
years, to give any third party, for a charge no more than your
|
144 |
+
cost of physically performing source distribution, a complete
|
145 |
+
machine-readable copy of the corresponding source code, to be
|
146 |
+
distributed under the terms of Sections 1 and 2 above on a medium
|
147 |
+
customarily used for software interchange; or,
|
148 |
+
|
149 |
+
c) Accompany it with the information you received as to the offer
|
150 |
+
to distribute corresponding source code. (This alternative is
|
151 |
+
allowed only for noncommercial distribution and only if you
|
152 |
+
received the program in object code or executable form with such
|
153 |
+
an offer, in accord with Subsection b above.)
|
154 |
+
|
155 |
+
The source code for a work means the preferred form of the work for
|
156 |
+
making modifications to it. For an executable work, complete source
|
157 |
+
code means all the source code for all modules it contains, plus any
|
158 |
+
associated interface definition files, plus the scripts used to
|
159 |
+
control compilation and installation of the executable. However, as a
|
160 |
+
special exception, the source code distributed need not include
|
161 |
+
anything that is normally distributed (in either source or binary
|
162 |
+
form) with the major components (compiler, kernel, and so on) of the
|
163 |
+
operating system on which the executable runs, unless that component
|
164 |
+
itself accompanies the executable.
|
165 |
+
|
166 |
+
If distribution of executable or object code is made by offering
|
167 |
+
access to copy from a designated place, then offering equivalent
|
168 |
+
access to copy the source code from the same place counts as
|
169 |
+
distribution of the source code, even though third parties are not
|
170 |
+
compelled to copy the source along with the object code.
|
171 |
+
|
172 |
+
4. You may not copy, modify, sublicense, or distribute the Program
|
173 |
+
except as expressly provided under this License. Any attempt
|
174 |
+
otherwise to copy, modify, sublicense or distribute the Program is
|
175 |
+
void, and will automatically terminate your rights under this License.
|
176 |
+
However, parties who have received copies, or rights, from you under
|
177 |
+
this License will not have their licenses terminated so long as such
|
178 |
+
parties remain in full compliance.
|
179 |
+
|
180 |
+
5. You are not required to accept this License, since you have not
|
181 |
+
signed it. However, nothing else grants you permission to modify or
|
182 |
+
distribute the Program or its derivative works. These actions are
|
183 |
+
prohibited by law if you do not accept this License. Therefore, by
|
184 |
+
modifying or distributing the Program (or any work based on the
|
185 |
+
Program), you indicate your acceptance of this License to do so, and
|
186 |
+
all its terms and conditions for copying, distributing or modifying
|
187 |
+
the Program or works based on it.
|
188 |
+
|
189 |
+
6. Each time you redistribute the Program (or any work based on the
|
190 |
+
Program), the recipient automatically receives a license from the
|
191 |
+
original licensor to copy, distribute or modify the Program subject to
|
192 |
+
these terms and conditions. You may not impose any further
|
193 |
+
restrictions on the recipients' exercise of the rights granted herein.
|
194 |
+
You are not responsible for enforcing compliance by third parties to
|
195 |
+
this License.
|
196 |
+
|
197 |
+
7. If, as a consequence of a court judgment or allegation of patent
|
198 |
+
infringement or for any other reason (not limited to patent issues),
|
199 |
+
conditions are imposed on you (whether by court order, agreement or
|
200 |
+
otherwise) that contradict the conditions of this License, they do not
|
201 |
+
excuse you from the conditions of this License. If you cannot
|
202 |
+
distribute so as to satisfy simultaneously your obligations under this
|
203 |
+
License and any other pertinent obligations, then as a consequence you
|
204 |
+
may not distribute the Program at all. For example, if a patent
|
205 |
+
license would not permit royalty-free redistribution of the Program by
|
206 |
+
all those who receive copies directly or indirectly through you, then
|
207 |
+
the only way you could satisfy both it and this License would be to
|
208 |
+
refrain entirely from distribution of the Program.
|
209 |
+
|
210 |
+
If any portion of this section is held invalid or unenforceable under
|
211 |
+
any particular circumstance, the balance of the section is intended to
|
212 |
+
apply and the section as a whole is intended to apply in other
|
213 |
+
circumstances.
|
214 |
+
|
215 |
+
It is not the purpose of this section to induce you to infringe any
|
216 |
+
patents or other property right claims or to contest validity of any
|
217 |
+
such claims; this section has the sole purpose of protecting the
|
218 |
+
integrity of the free software distribution system, which is
|
219 |
+
implemented by public license practices. Many people have made
|
220 |
+
generous contributions to the wide range of software distributed
|
221 |
+
through that system in reliance on consistent application of that
|
222 |
+
system; it is up to the author/donor to decide if he or she is willing
|
223 |
+
to distribute software through any other system and a licensee cannot
|
224 |
+
impose that choice.
|
225 |
+
|
226 |
+
This section is intended to make thoroughly clear what is believed to
|
227 |
+
be a consequence of the rest of this License.
|
228 |
+
|
229 |
+
8. If the distribution and/or use of the Program is restricted in
|
230 |
+
certain countries either by patents or by copyrighted interfaces, the
|
231 |
+
original copyright holder who places the Program under this License
|
232 |
+
may add an explicit geographical distribution limitation excluding
|
233 |
+
those countries, so that distribution is permitted only in or among
|
234 |
+
countries not thus excluded. In such case, this License incorporates
|
235 |
+
the limitation as if written in the body of this License.
|
236 |
+
|
237 |
+
9. The Free Software Foundation may publish revised and/or new versions
|
238 |
+
of the General Public License from time to time. Such new versions will
|
239 |
+
be similar in spirit to the present version, but may differ in detail to
|
240 |
+
address new problems or concerns.
|
241 |
+
|
242 |
+
Each version is given a distinguishing version number. If the Program
|
243 |
+
specifies a version number of this License which applies to it and "any
|
244 |
+
later version", you have the option of following the terms and conditions
|
245 |
+
either of that version or of any later version published by the Free
|
246 |
+
Software Foundation. If the Program does not specify a version number of
|
247 |
+
this License, you may choose any version ever published by the Free Software
|
248 |
+
Foundation.
|
249 |
+
|
250 |
+
10. If you wish to incorporate parts of the Program into other free
|
251 |
+
programs whose distribution conditions are different, write to the author
|
252 |
+
to ask for permission. For software which is copyrighted by the Free
|
253 |
+
Software Foundation, write to the Free Software Foundation; we sometimes
|
254 |
+
make exceptions for this. Our decision will be guided by the two goals
|
255 |
+
of preserving the free status of all derivatives of our free software and
|
256 |
+
of promoting the sharing and reuse of software generally.
|
257 |
+
|
258 |
+
NO WARRANTY
|
259 |
+
|
260 |
+
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
261 |
+
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
262 |
+
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
263 |
+
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
264 |
+
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
265 |
+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
266 |
+
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
267 |
+
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
268 |
+
REPAIR OR CORRECTION.
|
269 |
+
|
270 |
+
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
271 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
272 |
+
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
273 |
+
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
274 |
+
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
275 |
+
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
276 |
+
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
277 |
+
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
278 |
+
POSSIBILITY OF SUCH DAMAGES.
|
279 |
+
|
280 |
+
END OF TERMS AND CONDITIONS
|
281 |
+
|
282 |
+
How to Apply These Terms to Your New Programs
|
283 |
+
|
284 |
+
If you develop a new program, and you want it to be of the greatest
|
285 |
+
possible use to the public, the best way to achieve this is to make it
|
286 |
+
free software which everyone can redistribute and change under these terms.
|
287 |
+
|
288 |
+
To do so, attach the following notices to the program. It is safest
|
289 |
+
to attach them to the start of each source file to most effectively
|
290 |
+
convey the exclusion of warranty; and each file should have at least
|
291 |
+
the "copyright" line and a pointer to where the full notice is found.
|
292 |
+
|
293 |
+
<one line to give the program's name and a brief idea of what it does.>
|
294 |
+
Copyright (C) <year> <name of author>
|
295 |
+
|
296 |
+
This program is free software; you can redistribute it and/or modify
|
297 |
+
it under the terms of the GNU General Public License as published by
|
298 |
+
the Free Software Foundation; either version 2 of the License, or
|
299 |
+
(at your option) any later version.
|
300 |
+
|
301 |
+
This program is distributed in the hope that it will be useful,
|
302 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
303 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
304 |
+
GNU General Public License for more details.
|
305 |
+
|
306 |
+
You should have received a copy of the GNU General Public License along
|
307 |
+
with this program; if not, write to the Free Software Foundation, Inc.,
|
308 |
+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
309 |
+
|
310 |
+
Also add information on how to contact you by electronic and paper mail.
|
311 |
+
|
312 |
+
If the program is interactive, make it output a short notice like this
|
313 |
+
when it starts in an interactive mode:
|
314 |
+
|
315 |
+
Gnomovision version 69, Copyright (C) year name of author
|
316 |
+
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
317 |
+
This is free software, and you are welcome to redistribute it
|
318 |
+
under certain conditions; type `show c' for details.
|
319 |
+
|
320 |
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
321 |
+
parts of the General Public License. Of course, the commands you use may
|
322 |
+
be called something other than `show w' and `show c'; they could even be
|
323 |
+
mouse-clicks or menu items--whatever suits your program.
|
324 |
+
|
325 |
+
You should also get your employer (if you work as a programmer) or your
|
326 |
+
school, if any, to sign a "copyright disclaimer" for the program, if
|
327 |
+
necessary. Here is a sample; alter the names:
|
328 |
+
|
329 |
+
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
330 |
+
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
331 |
+
|
332 |
+
<signature of Ty Coon>, 1 April 1989
|
333 |
+
Ty Coon, President of Vice
|
334 |
+
|
335 |
+
This General Public License does not permit incorporating your program into
|
336 |
+
proprietary programs. If your program is a subroutine library, you may
|
337 |
+
consider it more useful to permit linking proprietary applications with the
|
338 |
+
library. If this is what you want to do, use the GNU Lesser General
|
339 |
+
Public License instead of this License.
|
lng/L2SCA/Makefile
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This is a rudimentary Makefile for rebuilding the tregex distribution.
|
2 |
+
# We actually use ant (q.v.) or a Java IDE.
|
3 |
+
|
4 |
+
JAVAC = javac
|
5 |
+
JAVAFLAGS = -O -d classes -encoding utf-8
|
6 |
+
|
7 |
+
tregex:
|
8 |
+
mkdir -p classes
|
9 |
+
$(JAVAC) -classpath CLASSPATH:lib/AppleJavaExtensions.jar $(JAVAFLAGS) src/edu/stanford/nlp/*/*.java src/edu/stanford/nlp/*/*/*.java src/edu/stanford/nlp/*/*/*/*.java
|
10 |
+
cd classes ; jar -cfm ../stanford-tregex-`date +%Y-%m-%d`.jar ../src/edu/stanford/nlp/trees/tregex/gui/tregex-manifest.txt edu ; cd ..
|
11 |
+
cp stanford-tregex-`date +%Y-%m-%d`.jar stanford-tregex.jar
|
12 |
+
|
lng/L2SCA/README-L2SCA.txt
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
This code is the L2 Syntactic Complexity Analyzer described in:
|
2 |
+
|
3 |
+
Lu, Xiaofei (2010). Automatic analysis of syntactic complexity in second language writing. International Journal of Corpus Linguistics, 15(4):474-496.
|
4 |
+
|
5 |
+
Version 3.3.3, released June 30, 2016
|
6 |
+
|
7 |
+
Copyright (C) 2016 Xiaofei Lu
|
8 |
+
|
9 |
+
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
|
10 |
+
|
11 |
+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
12 |
+
|
13 |
+
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
14 |
+
|
15 |
+
To download the latest version of this software, follow the appropriate link at
|
16 |
+
http://www.personal.psu.edu/xxl13/download.html
|
17 |
+
|
18 |
+
==============================================================================
|
19 |
+
ABOUT
|
20 |
+
|
21 |
+
L2 Syntactic Complexity Analyzer is designed to automate syntactic complexity analysis of written English language samples produced by advanced learners of English using fourteen different measures proposed in the second language development literature. The analyzer takes a written English language sample in plain text format as input and generates 14 indices of syntactic complexity of the sample.
|
22 |
+
|
23 |
+
The analyzer is implemented in python and runs on UNIX-like (LINUX, MAC OS, or UNIX) systems with Java 1.5 and python 2.5 or higher installed. The analyzer takes as input a plain text file, counts the frequency of the following 9 structures in the text: words (W), sentences (S), verb phrases (VP), clauses (C), T-units (T), dependent clauses (DC), complex T-units (CT), coordinate phrases (CP), and complex nominals (CN), and computes the following 14 syntactic complexity indices of the text: mean length of sentence (MLS), mean length of T-unit (MLT), mean length of clause (MLC), clauses per sentence (C/S), verb phrases per T-unit (VP/T),, clauses per T-unit (C/T), dependent clauses per clause (DC/C), dependent clauses per T-unit (DC/T), T-units per sentence (T/S), complex T-unit ratio (CT/T), coordinate phrases per T-unit (CP/T), coordinate phrases per clause (CP/C), complex nominals per T-unit (CN/T), and complex nominals per clause (CP/C).
|
24 |
+
|
25 |
+
The analyzer calls the Stanford praser (Klein & Manning, 2003) to parse the input file and Tregex (Levy and Andrew, 2006) to query the parse trees. Both the Stanford parser and Tregex are bundled in this download and installation along with the appropriate licenses.
|
26 |
+
|
27 |
+
CONTENTS
|
28 |
+
|
29 |
+
[1] Running the single file analyzer
|
30 |
+
[2] Input format
|
31 |
+
[3] Output format
|
32 |
+
[4] Running the multiple file analyzer
|
33 |
+
[5] A list of the files included in this package
|
34 |
+
|
35 |
+
==============================================================================
|
36 |
+
[1] Running the single file analyzer
|
37 |
+
|
38 |
+
To run the single file analyzer, type the following at the command line:
|
39 |
+
|
40 |
+
python analyzeText.py <input_file> <output_file>
|
41 |
+
|
42 |
+
Note that the python script should be called from within this directory. To make sure everything runs correctly, run the following and compare your output with the sample1_output file in the samples/ subdirectory.
|
43 |
+
|
44 |
+
python analyzeText.py samples/sample1.txt samples/sample1_testing
|
45 |
+
==============================================================================
|
46 |
+
[2] Input format
|
47 |
+
|
48 |
+
The input file should be a clean English text in plain text format (with a .txt suffix in the filename). Sample files can be found in the "samples" sub-directory.
|
49 |
+
|
50 |
+
==============================================================================
|
51 |
+
[3] Output format
|
52 |
+
|
53 |
+
A name of the output file must be provided, but you can name it anything you like.
|
54 |
+
|
55 |
+
The first line in the output file is a comma-delimited list of 24 fields, including Filename, abbreviations of the 9 structures mentioned above, and abbreviations of the 14 syntactic complexity indices mentioned above.
|
56 |
+
|
57 |
+
The second line (and subsequent lines if analyzing multiple files in a directory) is a comma-delimited list of 24 values, including the name of the input file, the frequency counts of the 9 structures, and the values of the 14 syntactic complexity indices.
|
58 |
+
|
59 |
+
This format may be hard to read but allows for easy import to Excel or SPSS.
|
60 |
+
|
61 |
+
==============================================================================
|
62 |
+
[4] Running the multiple file analyzer
|
63 |
+
|
64 |
+
To run the multiple file analyzer, type the following at the command line:
|
65 |
+
|
66 |
+
python analyzeFolder.py <path_to_input_file_folder> <output_file>
|
67 |
+
|
68 |
+
path_to_input_file_folder is the path to the folder or directory that contains the text files you want to analyze (e.g., /home/inputFiles/). The path should end with a slash, as in the example. Only files that end with the .txt suffix will be analyzed.
|
69 |
+
|
70 |
+
Note that the python script should be called from within this directory. To make sure everything runs correctly, run the following and compare your output with the samples_output file in the samples/ subdirectory.
|
71 |
+
|
72 |
+
python analyzeFolder.py samples/ samples/samples_testing
|
73 |
+
==============================================================================
|
74 |
+
[5] A list of the files included in this package
|
75 |
+
|
76 |
+
README-L2SCA.txt - this file
|
77 |
+
|
78 |
+
analyzeText.py - the single file analyzer
|
79 |
+
|
80 |
+
analyzeFolder.py - the multiple file analyzer
|
81 |
+
|
82 |
+
samples/ - this directory includes the following sample files:
|
83 |
+
|
84 |
+
sample1.txt: an English text in plain text format
|
85 |
+
|
86 |
+
sample2.txt: another English text in plain text format
|
87 |
+
|
88 |
+
sample1_output: sample output file generated by the single file analyzer
|
89 |
+
|
90 |
+
samples_output: sample output file generated by the multiple file analyzer
|
91 |
+
|
92 |
+
All files for Tregex 3.3.1
|
93 |
+
|
94 |
+
Stanford parser 3.3.1
|
lng/L2SCA/README-gui.txt
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Tregex GUI v3.3.1 - 2014-01-04
|
2 |
+
----------------------------------------------
|
3 |
+
|
4 |
+
Copyright (c) 2003-2012 The Board of Trustees of
|
5 |
+
The Leland Stanford Junior University. All Rights Reserved.
|
6 |
+
|
7 |
+
Original core Tregex code by Roger Levy and Galen Andrew.
|
8 |
+
Original core Tsurgeon code by Roger Levy.
|
9 |
+
TregexGUI by Anna Rafferty
|
10 |
+
Support code, additional features, etc. by Chris Manning
|
11 |
+
This release prepared by John Bauer.
|
12 |
+
|
13 |
+
----------------------------
|
14 |
+
TREGEX GRAPHICAL USER INTERFACE (GUI) README
|
15 |
+
----------------------------
|
16 |
+
|
17 |
+
The Tregex GUI is a graphical user interface for Tregex and Tsurgeon.
|
18 |
+
You can access it by double-clicking on the jar file tregex.jar. For
|
19 |
+
searching large treebanks, you may need to use more memory; the script
|
20 |
+
run-tregex-gui.command includes this allocation of memory and can be run
|
21 |
+
from the command line or double-click to run on a Mac. If you still have
|
22 |
+
memory problems, you can allot more memory by opening the script in a
|
23 |
+
text editor and changing "-mx300m" to include a bigger number (e.g.,
|
24 |
+
"-mx512m"). Tregex requires Java 1.5+. Further documentation for
|
25 |
+
Tregex and Tsurgeon can be found in README-tregex.txt and
|
26 |
+
README-tsurgeon.txt, respectively.
|
27 |
+
|
28 |
+
----------------------------
|
29 |
+
LOADING TREEBANKS/TREE FILES
|
30 |
+
----------------------------
|
31 |
+
|
32 |
+
To load a file containing Penn Treebank formatted trees, choose "Load trees..." from the file menu.
|
33 |
+
Multiple tree files and/or directories may be selected. After selecting the tree files you wish to
|
34 |
+
load, press "Load with file filters" to choose what filters you would like to apply to the files.
|
35 |
+
All filters are run based on the name of the file. Possible filtering options are:
|
36 |
+
|
37 |
+
- Prefix: Load only files that start with the given character sequence
|
38 |
+
|
39 |
+
- Extension: Load only files that end with the given character sequence
|
40 |
+
|
41 |
+
- Has number in range: Loads only numbered files such that the number falls in the given range, inclusive.
|
42 |
+
Ranges can be disjoint as long as multiple ranges are comma-separated (e.g., "100-500,550-700")
|
43 |
+
|
44 |
+
File filters are combined such that all loaded files must obey all of
|
45 |
+
the filters; only one filter of any given type should be specified.
|
46 |
+
|
47 |
+
Once the tree files are loaded, their names appear in the upper left hand panel "Tree files:".
|
48 |
+
Unchecking the check boxes next to the files causes the unchecked files not to be included in
|
49 |
+
searches/tsurgeon operations. To remove all files from the tree panel, choose "Clear all files"
|
50 |
+
from the Edit menu.
|
51 |
+
|
52 |
+
----------------------------
|
53 |
+
PERFORMING TREGEX SEARCHES
|
54 |
+
----------------------------
|
55 |
+
|
56 |
+
To perform a Tregex search, load the files you would like to search and type a Tregex pattern
|
57 |
+
in the "Pattern:" box in the top middle of the window. Press "Help" beneath the Pattern box
|
58 |
+
for information about Tregex syntax. After you have typed the pattern, press "Search" to
|
59 |
+
find all matches to the given pattern.
|
60 |
+
|
61 |
+
By default, trees that contain at least one match are displayed in the "Matches:" panel in the
|
62 |
+
top right of the window, and the first matching tree is graphically displayed in the bottom
|
63 |
+
portion of the window. Click on a match in the Match panel to display it graphically. In the
|
64 |
+
graphical display, matched nodes in the tree are displayed in a different color than other nodes.
|
65 |
+
To display only the matched subtrees, choose "Preferences..." (Mac, from the Application menu) or
|
66 |
+
"Options..." (other OS, under Tools), and check "Show only matched portions of the tree". You must
|
67 |
+
rerun the search to switch between showing only matched portions and showing full trees.
|
68 |
+
|
69 |
+
In preferences, other display options can also be set, such as the colors, size, and font used by
|
70 |
+
the graphical display.
|
71 |
+
|
72 |
+
----------------------------
|
73 |
+
USING TSURGEON
|
74 |
+
----------------------------
|
75 |
+
|
76 |
+
Tsurgeon modifications can also be performed using Interactive Tregex. To enable Tsurgeon, choose
|
77 |
+
"Preferences..." from the File menu and check "Enable Tsurgeon". You can now run Tsurgeon scripts.
|
78 |
+
Tsurgeon commands must be paired with a Tregex pattern that names the nodes on which modifications
|
79 |
+
will be performed. Type the Tregex pattern in the Pattern box, and type the modifications you would
|
80 |
+
like to make in the "Tsurgeon script:" box. Then click "Run script" to perform the modifications.
|
81 |
+
Each Tsurgeon operation must appear on a separate line in the Tsurgeon script box. Press "Help" for
|
82 |
+
some information about Tsurgeon operation syntax.
|
83 |
+
|
84 |
+
|
85 |
+
----------------------------
|
86 |
+
SAVING RESULTS
|
87 |
+
----------------------------
|
88 |
+
|
89 |
+
You can save the results of a Tregex search or Tsurgeon operation by choosing "Save matches..." from the
|
90 |
+
File menu. This saves all trees in the Matches panel in Penn Treebank form. "Save matched sentences..." saves
|
91 |
+
the matches in sentence String form, just as they show up in the matches panel.
|
92 |
+
|
93 |
+
You can also save a log of the number of matches found for each pattern you have searched. By clicking the
|
94 |
+
"Statistics" button in the middle of the screen, below the Tsurgeon buttons, you can see a table of the patterns
|
95 |
+
for which you have searched, the number of trees that each matched, and the number of overall matches that were
|
96 |
+
found. To save this information in a tab delimited text file, choose "Save statistics..." from the File menu.
|
97 |
+
|
98 |
+
All three save options save files in the encoding specified in the Preferences panel for loading tree files.
|
99 |
+
|
100 |
+
----------------------------
|
101 |
+
MULTILANGUAGE SUPPORT
|
102 |
+
----------------------------
|
103 |
+
|
104 |
+
Some multilanguage support is built into Tregex GUI, and most languages can be read by the GUI. To enable
|
105 |
+
this support, go to Preferences (Mac, under the application menu) or Options (other OS, under the Tools menu).
|
106 |
+
Several options may need to be changed: tree reader factory, head finder, font, and encoding. Several possible
|
107 |
+
tree reader factories and head finders are provided; you may also specify your own. Two common languages you may be
|
108 |
+
trying to use are Chinese or Arabic; any head finder or tree reader factory beginning with "Chinese" or "Arabic" will
|
109 |
+
work for these languages, and additionally, CTBTreeReaderFactory is compatible with many Chinese treebanks. Based on
|
110 |
+
your choice of head finder and tree reader factory, the Tregex GUI will guess if you may need a different font and/or
|
111 |
+
text encoding. If a different text encoding is usually used for your selections, you will be prompted as to what text
|
112 |
+
encoding you would like to use. This may also be specified directly in the Preferences panel.
|
113 |
+
|
114 |
+
|
115 |
+
|
116 |
+
----------------------------
|
117 |
+
QUESTIONS
|
118 |
+
----------------------------
|
119 |
+
|
120 |
+
For more information on Tregex or Tsurgeon, read README-tregex.txt and README-tsurgeon.txt, and also look at the javadocs
|
121 |
+
suggested in those files. For questions about this distribution, please contact Stanford's JavaNLP group at
|
122 |
+
[email protected]. We provide assistance on a best-effort basis.
|
123 |
+
|
124 |
+
----------------------------
|
125 |
+
LICENSE
|
126 |
+
----------------------------
|
127 |
+
|
128 |
+
Tregex GUI
|
129 |
+
Copyright (c) 2007-2011 The Board of Trustees of
|
130 |
+
The Leland Stanford Junior University. All Rights Reserved.
|
131 |
+
|
132 |
+
This program is free software; you can redistribute it and/or
|
133 |
+
modify it under the terms of the GNU General Public License
|
134 |
+
as published by the Free Software Foundation; either version 2
|
135 |
+
of the License, or (at your option) any later version.
|
136 |
+
|
137 |
+
This program is distributed in the hope that it will be useful,
|
138 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
139 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
140 |
+
GNU General Public License for more details.
|
141 |
+
|
142 |
+
You should have received a copy of the GNU General Public License
|
143 |
+
along with this program; if not, write to the Free Software
|
144 |
+
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
145 |
+
|
146 |
+
For more information, bug reports, fixes, contact:
|
147 |
+
Christopher Manning
|
148 |
+
Dept of Computer Science, Gates 1A
|
149 |
+
Stanford CA 94305-9010
|
150 |
+
USA
|
151 | |
152 |
+
http://www-nlp.stanford.edu/software/tregex.shtml
|
153 |
+
|
154 |
+
|
155 |
+
----------------------------
|
156 |
+
CHANGES
|
157 |
+
----------------------------
|
158 |
+
|
159 |
+
2014-01-04 3.3.1 Bugfix release, new createSubtree tsurgeon
|
160 |
+
operation
|
161 |
+
|
162 |
+
2013-11-12 3.3.0 Allow a TregexMatcher to have its own
|
163 |
+
HeadFinder, useful for the dependencies
|
164 |
+
|
165 |
+
2013-06-19 3.2.0 Fix for tsurgeon number reading bug
|
166 |
+
|
167 |
+
2013-04-04 2.0.6 Update to maintain compatibility
|
168 |
+
|
169 |
+
2012-11-11 2.0.5 Efficiency improvements
|
170 |
+
|
171 |
+
2012-07-09 2.0.4 Minor bug fixes
|
172 |
+
|
173 |
+
2012-05-22 2.0.3 Rebuilt to be compatible with everything.
|
174 |
+
|
175 |
+
2012-03-09 2.0.2 Efficiency improvements
|
176 |
+
|
177 |
+
2011-12-16 2.0.1 Fix bug in matchesAt, fix bug in category
|
178 |
+
function, add macros
|
179 |
+
|
180 |
+
2011-09-14 2.0.0 Efficiency improvements, include semgrex.
|
181 |
+
|
182 |
+
2011-05-15 1.4.4 Rebuilt to be compatible with everything.
|
183 |
+
|
184 |
+
2011-05-15 1.4.3 Rebuilt to be compatible with everything.
|
185 |
+
|
186 |
+
2011-04-17 1.4.2 Rebuilt to be compatible with tagger, parser,
|
187 |
+
and corenlp.
|
188 |
+
|
189 |
+
2010-11-18 1.4.1 Small fixes and improvements (improved help
|
190 |
+
screens, multipattern Tsurgeon scripts with
|
191 |
+
comments introduced by % supported, unclosed
|
192 |
+
regex no longer crashes GUI, support character
|
193 |
+
encodings in script files, fix bug in tregex
|
194 |
+
matching immediate domination path, TregexGUI
|
195 |
+
now shows filename and line number of each
|
196 |
+
match in matches panel)
|
197 |
+
|
198 |
+
2009-09-30 1.4 GUI slider for tree size, generalized relabel
|
199 |
+
command (incompatibly), __ and @ now supported
|
200 |
+
in path constraints; bugfixes.
|
201 |
+
|
202 |
+
2008-05-06 1.1 Several bug fixes; addition of browse trees
|
203 |
+
function, improved copy/paste and drag and
|
204 |
+
drop support; misc. feature additions
|
205 |
+
|
206 |
+
2007-09-20 1.0 Initial release
|
lng/L2SCA/README-tregex.txt
ADDED
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Tregex v3.3.1 - 2014-01-04
|
2 |
+
----------------------------------------------
|
3 |
+
|
4 |
+
Copyright (c) 2003-2012 The Board of Trustees of
|
5 |
+
The Leland Stanford Junior University. All Rights Reserved.
|
6 |
+
|
7 |
+
Original core Tregex code by Roger Levy and Galen Andrew.
|
8 |
+
Original core Tsurgeon code by Roger Levy.
|
9 |
+
GUI by Anna Rafferty
|
10 |
+
Support code, additional features, etc. by Chris Manning
|
11 |
+
This release prepared by John Bauer.
|
12 |
+
|
13 |
+
This package contains Tregex and Tsurgeon.
|
14 |
+
|
15 |
+
Tregex is a Tgrep2-style utility for matching patterns in trees. It can
|
16 |
+
be run in a graphical user interface, from the command line using the
|
17 |
+
TregexPattern main method, or used programmatically in java code via the
|
18 |
+
TregexPattern, TregexMatcher and TregexPatternCompiler classes.
|
19 |
+
|
20 |
+
As of version 1.2, the Tsurgeon tree-transformation utility is bundled
|
21 |
+
together with Tregex. See the file README.tsurgeon for details.
|
22 |
+
|
23 |
+
Java version 1.6 is required to use Tregex. If you really want to use
|
24 |
+
Tregex under an earlier version of Java, look into RetroWeaver:
|
25 |
+
|
26 |
+
http://retroweaver.sourceforge.net/
|
27 |
+
|
28 |
+
|
29 |
+
QUICKSTART
|
30 |
+
-----------------------------------------------
|
31 |
+
|
32 |
+
Programmatic use, command-line use, and GUI-use are supported. To access the
|
33 |
+
graphical interface for Tsurgeon and Tregex, double-click the tregex.jar file.
|
34 |
+
Some help (particularly with syntax) is provided within the program; for further
|
35 |
+
assistance, see README-gui.txt and the documentation mentioned below.
|
36 |
+
|
37 |
+
A full explanation of pattern syntax and usage is given in the javadocs
|
38 |
+
(particularly TregexPattern), and some of this information is also presented in
|
39 |
+
the TREGEX SYNTAX section below. As a quick example of usage,
|
40 |
+
the following line will scan an English PennTreebank annotated corpus
|
41 |
+
and print all nodes representing a verb phrase dominating a past-tense
|
42 |
+
verb and a noun phrase.
|
43 |
+
|
44 |
+
./tregex.sh 'VP < VBD < NP' corpus_dir
|
45 |
+
|
46 |
+
|
47 |
+
CONTENTS
|
48 |
+
-----------------------------------------------
|
49 |
+
|
50 |
+
README-tregex.txt
|
51 |
+
|
52 |
+
This file.
|
53 |
+
|
54 |
+
|
55 |
+
README-tsurgeon.txt
|
56 |
+
|
57 |
+
Documentation for Tsurgeon, a tool for modifying trees.
|
58 |
+
|
59 |
+
README-gui.txt
|
60 |
+
|
61 |
+
Documentation for the graphical interface for Tregex and Tsurgeon tools.
|
62 |
+
|
63 |
+
LICENSE.txt
|
64 |
+
|
65 |
+
Tregex is licensed under the GNU General Public License.
|
66 |
+
|
67 |
+
stanford-tregex.jar
|
68 |
+
|
69 |
+
This is a JAR file containing all the Stanford classes necessary to
|
70 |
+
run tregex.
|
71 |
+
|
72 |
+
src
|
73 |
+
|
74 |
+
A directory containing the Java 1.6 source code for the Tregex
|
75 |
+
distribution.
|
76 |
+
|
77 |
+
javadoc
|
78 |
+
|
79 |
+
Javadocs for the distribution. In particular, look at the javadocs
|
80 |
+
for the class edu.stanford.nlp.trees.tregex.TregexPattern. The
|
81 |
+
first part of that class's javadoc describes syntax and semantics
|
82 |
+
for relations, node labels, node names, and variable groups. The
|
83 |
+
docs for the main method describe command-line options.
|
84 |
+
|
85 |
+
tregex.sh
|
86 |
+
|
87 |
+
a shell script for invoking the Tregex tree search tool.
|
88 |
+
|
89 |
+
tsurgeon.sh
|
90 |
+
|
91 |
+
a shell script for invoking the Tsurgeon tree transformation tool.
|
92 |
+
|
93 |
+
run-tregex-gui.command
|
94 |
+
|
95 |
+
A command file that can be double-clicked on a Mac to start the gui.
|
96 |
+
|
97 |
+
run-tregex-gui.bat
|
98 |
+
|
99 |
+
A bat file that can be double-clicked on a PC to start the gui.
|
100 |
+
|
101 |
+
examples
|
102 |
+
|
103 |
+
a directory containing several sample files to show Tsurgeon operation:
|
104 |
+
- atree
|
105 |
+
a sample natural-language tree in Penn Treebank annotation style.
|
106 |
+
- exciseNP
|
107 |
+
- renameVerb
|
108 |
+
- relabelWithGroupName
|
109 |
+
Sample tree-transformation operation files for Tsurgeon. See
|
110 |
+
README-tsurgeon.txt for more information about the contents of these
|
111 |
+
files.
|
112 |
+
|
113 |
+
|
114 |
+
TREGEX
|
115 |
+
-----------------------------------------------
|
116 |
+
Tregex Pattern Syntax and Uses
|
117 |
+
|
118 |
+
Using a Tregex pattern, you can find only those trees that match the pattern you're
|
119 |
+
looking for. The following table shows the symbols that are allowed in the pattern,
|
120 |
+
and below there is more information about using these patterns.
|
121 |
+
|
122 |
+
Table of Symbols and Meanings:
|
123 |
+
A << B
|
124 |
+
A dominates B
|
125 |
+
A >> B
|
126 |
+
A is dominated by B
|
127 |
+
A < B
|
128 |
+
A immediately dominates B
|
129 |
+
A > B
|
130 |
+
A is immediately dominated by B
|
131 |
+
A $ B
|
132 |
+
A is a sister of B (and not equal to B)
|
133 |
+
A .. B
|
134 |
+
A precedes B
|
135 |
+
A . B
|
136 |
+
A immediately precedes B
|
137 |
+
A ,, B
|
138 |
+
A follows B
|
139 |
+
A , B
|
140 |
+
A immediately follows B
|
141 |
+
A <<, B
|
142 |
+
B is a leftmost descendent of A
|
143 |
+
A <<- B
|
144 |
+
B is a rightmost descendent of A
|
145 |
+
A >>, B
|
146 |
+
A is a leftmost descendent of B
|
147 |
+
A >>- B
|
148 |
+
A is a rightmost descendent of B
|
149 |
+
A <, B
|
150 |
+
B is the first child of A
|
151 |
+
A >, B
|
152 |
+
A is the first child of B
|
153 |
+
A <- B
|
154 |
+
B is the last child of A
|
155 |
+
A >- B
|
156 |
+
A is the last child of B
|
157 |
+
A <` B
|
158 |
+
B is the last child of A
|
159 |
+
A >` B
|
160 |
+
A is the last child of B
|
161 |
+
A <i B
|
162 |
+
B is the ith child of A (i > 0)
|
163 |
+
A >i B
|
164 |
+
A is the ith child of B (i > 0)
|
165 |
+
A <-i B
|
166 |
+
B is the ith-to-last child of A (i > 0)
|
167 |
+
A >-i B
|
168 |
+
A is the ith-to-last child of B (i > 0)
|
169 |
+
A <: B
|
170 |
+
B is the only child of A
|
171 |
+
A >: B
|
172 |
+
A is the only child of B
|
173 |
+
A <<: B
|
174 |
+
A dominates B via an unbroken chain (length > 0) of unary local trees.
|
175 |
+
A >>: B
|
176 |
+
A is dominated by B via an unbroken chain (length > 0) of unary local trees.
|
177 |
+
A $++ B
|
178 |
+
A is a left sister of B (same as $.. for context-free trees)
|
179 |
+
A $-- B
|
180 |
+
A is a right sister of B (same as $,, for context-free trees)
|
181 |
+
A $+ B
|
182 |
+
A is the immediate left sister of B (same as $. for context-free trees)
|
183 |
+
A $- B
|
184 |
+
A is the immediate right sister of B (same as $, for context-free trees)
|
185 |
+
A $.. B
|
186 |
+
A is a sister of B and precedes B
|
187 |
+
A $,, B
|
188 |
+
A is a sister of B and follows B
|
189 |
+
A $. B
|
190 |
+
A is a sister of B and immediately precedes B
|
191 |
+
A $, B
|
192 |
+
A is a sister of B and immediately follows B
|
193 |
+
A <+(C) B
|
194 |
+
A dominates B via an unbroken chain of (zero or more) nodes matching description C
|
195 |
+
A >+(C) B
|
196 |
+
A is dominated by B via an unbroken chain of (zero or more) nodes matching description C
|
197 |
+
A .+(C) B
|
198 |
+
A precedes B via an unbroken chain of (zero or more) nodes matching description C
|
199 |
+
A ,+(C) B
|
200 |
+
A follows B via an unbroken chain of (zero or more) nodes matching description C
|
201 |
+
A <<# B
|
202 |
+
B is a head of phrase A
|
203 |
+
A >># B
|
204 |
+
A is a head of phrase B
|
205 |
+
A <# B
|
206 |
+
B is the immediate head of phrase A
|
207 |
+
A ># B
|
208 |
+
A is the immediate head of phrase B
|
209 |
+
A == B
|
210 |
+
A and B are the same node
|
211 |
+
A : B
|
212 |
+
[this is a pattern-segmenting operator that places no constraints on the relationship between A and B]
|
213 |
+
|
214 |
+
Label descriptions can be literal strings, which must match labels exactly, or regular
|
215 |
+
expressions in regular expression bars: /regex/. Literal string matching proceeds as
|
216 |
+
String equality. In order to prevent ambiguity with other Tregex symbols, only standard
|
217 |
+
"identifiers" are allowed as literals, i.e., strings matching [a-zA-Z]([a-zA-Z0-9_])* .
|
218 |
+
If you want to use other symbols, you can do so by using a regular expression instead of
|
219 |
+
a literal string. A disjunctive list of literal strings can be given separated by '|'.
|
220 |
+
The special string '__' (two underscores) can be used to match any node. (WARNING!!
|
221 |
+
Use of the '__' node description may seriously slow down search.) If a label description
|
222 |
+
is preceded by '@', the label will match any node whose basicCategory matches the description.
|
223 |
+
NB: A single '@' thus scopes over a disjunction specified by '|': @NP|VP means things with basic category NP or VP.
|
224 |
+
|
225 |
+
Label description regular expressions are matched as find(), as in Perl/tgrep;
|
226 |
+
you need to specify ^ or $ to constrain matches.
|
227 |
+
|
228 |
+
In a chain of relations, all relations are relative to the first node in the chain.
|
229 |
+
For example, (S < VP < NP) means an S over a VP and also over an NP. If instead what
|
230 |
+
you want is an S above a VP above an NP, you should write S < (VP < NP).
|
231 |
+
|
232 |
+
Nodes can be grouped using parentheses '(' and ')' as in S < (NP $++ VP) to match an S
|
233 |
+
over an NP, where the NP has a VP as a right sister.
|
234 |
+
|
235 |
+
Boolean relational operators
|
236 |
+
|
237 |
+
Relations can be combined using the '&' and '|' operators, negated with the '!' operator,
|
238 |
+
and made optional with the '?' operator. Thus (NP < NN | < NNS) will match an NP node
|
239 |
+
dominating either an NN or an NNS. (NP > S & $++ VP) matches an NP that is both under
|
240 |
+
an S and has a VP as a right sister.
|
241 |
+
|
242 |
+
Relations can be grouped using brackets '[' and ']'. So the expression
|
243 |
+
|
244 |
+
NP [< NN | < NNS] & > S
|
245 |
+
|
246 |
+
matches an NP that (1) dominates either an NN or an NNS, and (2) is under an S. Without
|
247 |
+
brackets, & takes precedence over |, and equivalent operators are left-associative. Also
|
248 |
+
note that & is the default combining operator if the operator is omitted in a chain of
|
249 |
+
relations, so that the two patterns are equivalent:
|
250 |
+
(S < VP < NP)
|
251 |
+
(S < VP & < NP)
|
252 |
+
|
253 |
+
As another example, (VP < VV | < NP % NP) can be written explicitly as (VP [< VV | [< NP & % NP] ] ).
|
254 |
+
|
255 |
+
Relations can be negated with the '!' operator, in which case the expression will match
|
256 |
+
only if there is no node satisfying the relation. For example (NP !< NNP) matches only
|
257 |
+
NPs not dominating an NNP. Label descriptions can also be negated with '!': (NP < !NNP|NNS)
|
258 |
+
matches NPs dominating some node that is not an NNP or an NNS.
|
259 |
+
|
260 |
+
Relations can be made optional with the '?' operator. This way the expression will match even
|
261 |
+
if the optional relation is not satisfied. This is useful when used together with node naming
|
262 |
+
(see below).
|
263 |
+
|
264 |
+
|
265 |
+
Basic Categories
|
266 |
+
|
267 |
+
In order to consider only the "basic category" of a tree label, i.e. to ignore functional tags
|
268 |
+
or other annotations on the label, prefix that node's description with the @ symbol. For example
|
269 |
+
(@NP < @/NN.?/). This can only be used for individual nodes; if you want all nodes to use the
|
270 |
+
basic category, it would be more efficient to use a TreeNormalizer to remove functional tags
|
271 |
+
before passing the tree to the TregexPattern.
|
272 |
+
|
273 |
+
|
274 |
+
Segmenting patterns
|
275 |
+
|
276 |
+
The ":" operator allows you to segment a pattern into two pieces. This can simplify your pattern
|
277 |
+
writing. For example, the pattern S : NP matches only those S nodes in trees that also have an NP node.
|
278 |
+
|
279 |
+
|
280 |
+
Naming nodes
|
281 |
+
|
282 |
+
Nodes can be given names (a.k.a. handles) using '='. A named node will be stored in a map that
|
283 |
+
maps names to nodes so that if a match is found, the node corresponding to the named node can
|
284 |
+
be extracted from the map. For example (NP < NNP=name) will match an NP dominating an NNP
|
285 |
+
and after a match is found, the map can be queried with the name to retreived the matched node
|
286 |
+
using {@link TregexMatcher#getNode(Object o)} with (String) argument "name" (not "=name"). Note
|
287 |
+
that you are not allowed to name a node that is under the scope of a negation operator (the
|
288 |
+
semantics would be unclear, since you can't store a node that never gets matched to). Trying to
|
289 |
+
do so will cause a ParseException to be thrown. Named nodes can be put within the scope of an
|
290 |
+
optional operator.
|
291 |
+
|
292 |
+
Named nodes that refer back to previous named nodes need not have a node description -- this is
|
293 |
+
known as "backreferencing". In this case, the expression will match only when all instances of
|
294 |
+
the same name get matched to the same tree node. For example, the pattern:
|
295 |
+
|
296 |
+
(@NP <, (@NP $+ (/,/ $+ (@NP $+ /,/=comma))) <- =comma)
|
297 |
+
|
298 |
+
matches only an NP dominating exactly the sequence NP, NP; the mother NP cannot have any other
|
299 |
+
daughters. Multiple backreferences are allowed. If the node with no node description does not
|
300 |
+
refer to a previously named node, there will be no error, the expression simply will not match
|
301 |
+
anything.
|
302 |
+
|
303 |
+
Another way to refer to previously named nodes is with the "link" symbol: '~'. A link is like a
|
304 |
+
backreference, except that instead of having to be <i>equal to</i> the referred node, the
|
305 |
+
current node only has to match the label of the referred to node. A link cannot have a node
|
306 |
+
description, i.e. the '~' symbol must immediately follow a relation symbol.
|
307 |
+
|
308 |
+
|
309 |
+
Variable Groups
|
310 |
+
|
311 |
+
If you write a node description using a regular expression, you can assign its matching groups to
|
312 |
+
variable names. If more than one node has a group assigned to the same variable name, then matching
|
313 |
+
will only occur when all such groups capture the same string. This is useful for enforcing
|
314 |
+
coindexation constraints. The syntax is:
|
315 |
+
|
316 |
+
/ <regex-stuff> /#<group-number>%<variable-name>
|
317 |
+
|
318 |
+
For example, the pattern (designed for Penn Treebank trees):
|
319 |
+
|
320 |
+
@SBAR < /^WH.*-([0-9]+)$/#1%index<<(__=empty < (/^-NONE-/< /^\\*T\\*-([0-9]+)$/#1%index))
|
321 |
+
|
322 |
+
will match only such that the WH- node under the SBAR is coindexed with the trace node that gets the name empty.
|
323 |
+
|
324 |
+
|
325 |
+
MISCELLANEOUS
|
326 |
+
-----------------------------------------------
|
327 |
+
|
328 |
+
Head Finders
|
329 |
+
|
330 |
+
To use the headship relations <# ># <<# >># correctly it is
|
331 |
+
important to specify a HeadFinder class appropriate to the trees
|
332 |
+
that you are searching. For information about how to specify a
|
333 |
+
HeadFinder class at the command line or through the API, please read
|
334 |
+
the javadocs for the class
|
335 |
+
edu.stanford.nlp.trees.tregex.TregexPattern. The following
|
336 |
+
HeadFinder classes are included with the Tregex distribution:
|
337 |
+
|
338 |
+
Penn Treebank of English (http://www.cis.upenn.edu/~treebank/):
|
339 |
+
|
340 |
+
edu.stanford.nlp.trees.CollinsHeadFinder (default)
|
341 |
+
|
342 |
+
Penn Treebank of Chinese (http://www.cis.upenn.edu/~chinese/):
|
343 |
+
|
344 |
+
edu.stanford.nlp.trees.international.pennchinese.ChineseHeadFinder
|
345 |
+
|
346 |
+
Penn Treebank of Arabic (http://www.ircs.upenn.edu/arabic/):
|
347 |
+
|
348 |
+
edu.stanford.nlp.trees.international.arabic.ArabicHeadFinder
|
349 |
+
|
350 |
+
NEGRA (http://www.coli.uni-saarland.de/projects/sfb378/negra-corpus/)
|
351 |
+
|
352 |
+
and
|
353 |
+
|
354 |
+
TIGER (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERCorpus/)
|
355 |
+
|
356 |
+
treebanks of German (these can use the same headfinder):
|
357 |
+
|
358 |
+
edu.stanford.nlp.trees.international.negra.NegraHeadFinder
|
359 |
+
|
360 |
+
Tuebingen Treebank of Written German (http://www.sfs.uni-tuebingen.de/en_tuebadz.shtml):
|
361 |
+
|
362 |
+
edu.stanford.nlp.trees.international.tuebadz.TueBaDZHeadFinder
|
363 |
+
|
364 |
+
|
365 |
+
Tdiff
|
366 |
+
|
367 |
+
TregexGUI supports a constituent diff'ing method--similar to the UNIX diff command--for trees. To
|
368 |
+
enable Tdiff:
|
369 |
+
1) Clear the tree file list: File -> Clear tree file list
|
370 |
+
2) Enable Tdiff: Options -> Tdiff
|
371 |
+
3) Load two (2) files using the "File -> Load" dialog.
|
372 |
+
4) Select "Browse" on the main display
|
373 |
+
|
374 |
+
The GUI will display differences between each pair of trees in the two files. As such, the two files must
|
375 |
+
contain the same number of trees.
|
376 |
+
|
377 |
+
The first file in the tree file list is treated as the reference. Trees from the second file
|
378 |
+
will be displayed in the GUI, with bracketing differences highlighted in blue. Below the tree,
|
379 |
+
constituents in the reference tree that do not appear in the tree from the second file are shown
|
380 |
+
as lines below each respective span.
|
381 |
+
|
382 |
+
Tregex searches are supported and apply to the trees in the second file.
|
383 |
+
|
384 |
+
This feature was designed for debugging and analyzing parser output.
|
385 |
+
|
386 |
+
THANKS
|
387 |
+
-----------------------------------------------
|
388 |
+
|
389 |
+
Thanks to the members of the Stanford Natural Language Processing Lab
|
390 |
+
for great collaborative work on Java libraries for natural language
|
391 |
+
processing.
|
392 |
+
|
393 |
+
http://nlp.stanford.edu/javanlp/
|
394 |
+
|
395 |
+
LICENSE
|
396 |
+
-----------------------------------------------
|
397 |
+
|
398 |
+
Tregex, Tsurgeon, and Interactive Tregex
|
399 |
+
Copyright (c) 2003-2012 The Board of Trustees of
|
400 |
+
The Leland Stanford Junior University. All Rights Reserved.
|
401 |
+
|
402 |
+
This program is free software; you can redistribute it and/or
|
403 |
+
modify it under the terms of the GNU General Public License
|
404 |
+
as published by the Free Software Foundation; either version 2
|
405 |
+
of the License, or (at your option) any later version.
|
406 |
+
|
407 |
+
This program is distributed in the hope that it will be useful,
|
408 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
409 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
410 |
+
GNU General Public License for more details.
|
411 |
+
|
412 |
+
You should have received a copy of the GNU General Public License
|
413 |
+
along with this program; if not, write to the Free Software
|
414 |
+
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
415 |
+
|
416 |
+
For more information, bug reports, fixes, contact:
|
417 |
+
Christopher Manning
|
418 |
+
Dept of Computer Science, Gates 1A
|
419 |
+
Stanford CA 94305-9010
|
420 |
+
USA
|
421 | |
422 |
+
http://www-nlp.stanford.edu/software/tregex.shtml
|
423 |
+
|
424 |
+
|
425 |
+
CONTACT
|
426 |
+
-----------------------------------------------
|
427 |
+
|
428 |
+
For questions about this distribution, please contact Stanford's JavaNLP group at
|
429 |
+
[email protected]. We provide assistance on a best-effort basis.
|
lng/L2SCA/README-tsurgeon.txt
ADDED
@@ -0,0 +1,529 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Tsurgeon v3.3.1 - 2014-01-04
|
2 |
+
----------------------------------------------
|
3 |
+
|
4 |
+
Copyright (c) 2003-2012 The Board of Trustees of
|
5 |
+
The Leland Stanford Junior University. All Rights Reserved.
|
6 |
+
|
7 |
+
Original core Tregex code by Roger Levy and Galen Andrew.
|
8 |
+
Original core Tsurgeon code by Roger Levy.
|
9 |
+
GUI by Anna Rafferty
|
10 |
+
Support code, additional features, etc. by Chris Manning
|
11 |
+
This release prepared by John Bauer.
|
12 |
+
|
13 |
+
This package contains Tregex and Tsurgeon.
|
14 |
+
|
15 |
+
Tregex is a Tgrep2-style utility for matching patterns in trees. It can
|
16 |
+
be run in a graphical user interface, from the command line using the
|
17 |
+
TregexPattern main method, or used programmatically in java code via the
|
18 |
+
TregexPattern, TregexMatcher and TregexPatternCompiler classes.
|
19 |
+
|
20 |
+
As of version 1.2, the Tsurgeon tree-transformation utility is bundled
|
21 |
+
together with Tregex. See the file README.tsurgeon for details.
|
22 |
+
|
23 |
+
Java version 1.6 is required to use Tregex. If you really want to use
|
24 |
+
Tregex under an earlier version of Java, look into RetroWeaver:
|
25 |
+
|
26 |
+
http://retroweaver.sourceforge.net/
|
27 |
+
|
28 |
+
TSURGEON
|
29 |
+
----------------------------------------------
|
30 |
+
Tsurgeon is a tool for modifying trees that match a particular Tregex
|
31 |
+
pattern. Further documentation for Tregex and Tregex GUI can be found in
|
32 |
+
README-tregex.txt and README-gui.txt, respectively.
|
33 |
+
|
34 |
+
----------------------------------------------
|
35 |
+
|
36 |
+
Brief description:
|
37 |
+
|
38 |
+
Takes some trees, tries to match one or more tregex expressions to
|
39 |
+
each tree, and for each successful match applies some surgical
|
40 |
+
operations to the tree. Pretty-prints each resulting tree (after all
|
41 |
+
successful match/operation sets have applied) to standard output.
|
42 |
+
|
43 |
+
|
44 |
+
A simple example:
|
45 |
+
|
46 |
+
./tsurgeon.csh -treeFile atree exciseNP renameVerb
|
47 |
+
|
48 |
+
-----------------------------------------
|
49 |
+
RUNNING TREGEX
|
50 |
+
-----------------------------------------
|
51 |
+
|
52 |
+
Program Command Line Options:
|
53 |
+
|
54 |
+
-treeFile <filename>
|
55 |
+
|
56 |
+
specify the name of the file that has the trees you want to transform.
|
57 |
+
|
58 |
+
-po <matchPattern> <operation>
|
59 |
+
|
60 |
+
Apply a single operation to every tree using the specified match
|
61 |
+
pattern and the specified operation.
|
62 |
+
|
63 |
+
-s
|
64 |
+
|
65 |
+
Prints the output trees one per line, instead of pretty-printed.
|
66 |
+
|
67 |
+
The arguments are then Tsurgeon scripts.
|
68 |
+
Each argument should be the name of a transformation file that contains a list of pattern
|
69 |
+
and transformation operation list pairs. That is, it is a sequence of pairs of a
|
70 |
+
TregexPattern pattern on one or more lines, then a
|
71 |
+
blank line (empty or whitespace), then a list of transformation operations one per line
|
72 |
+
(as specified by Tsurgeon syntax below to apply when the pattern is matched,
|
73 |
+
and then another blank line (empty or whitespace).
|
74 |
+
Note the need for blank lines: The code crashes if they are not present as separators
|
75 |
+
(although the blank line at the end of the file can be omitted).
|
76 |
+
The script file can include comment lines, either whole comment lines or
|
77 |
+
trailing comments introduced by %, which extend to the end of line. A needed percent
|
78 |
+
mark in patterns or operations can be escaped by a preceding backslash.
|
79 |
+
|
80 |
+
-----------------------------------------
|
81 |
+
TSURGEON SYNTAX
|
82 |
+
-----------------------------------------
|
83 |
+
|
84 |
+
Legal operation syntax and semantics (see Examples section for further detail):
|
85 |
+
|
86 |
+
delete <name_1> <name_2> ... <name_m>
|
87 |
+
|
88 |
+
For each name_i, deletes the node it names and everything below it.
|
89 |
+
|
90 |
+
prune <name_1> <name_2> ... <name_m>
|
91 |
+
|
92 |
+
For each name_i, prunes out the node it names. Pruning differs from
|
93 |
+
deletion in that if pruning a node causes its parent to have no
|
94 |
+
children, then the parent is in turn pruned too.
|
95 |
+
|
96 |
+
excise <name1> <name2>
|
97 |
+
|
98 |
+
The name1 node should either dominate or be the same as the name2
|
99 |
+
node. This excises out everything from name1 to name2. All the
|
100 |
+
children of name2 go into the parent of name1, where name1 was.
|
101 |
+
|
102 |
+
relabel <name> <new-label>
|
103 |
+
|
104 |
+
Relabels the node to have the new label. There are three possible forms
|
105 |
+
for the new-label:
|
106 |
+
relabel nodeX VP - for changing a node label to an alphanumeric
|
107 |
+
string, relabel nodeX /''/ - for relabeling a node to something that
|
108 |
+
isn't a valid identifier without quoting, and relabel nodeX
|
109 |
+
/^VB(.*)$/verb\/$1/ - for regular expression based relabeling. In the
|
110 |
+
last case, all matches of the regular expression against the node
|
111 |
+
label are replaced with the replacement String. This has the semantics
|
112 |
+
of Java/Perl's replaceAll: you may use capturing groups and put them
|
113 |
+
in replacements with $n. Also, as in the example, you can escape a
|
114 |
+
slash in the middle of the second and third forms with \/ and \\.
|
115 |
+
This last version lets you make a new label that is an arbitrary
|
116 |
+
String function of the original label and additional characters that
|
117 |
+
you supply.
|
118 |
+
|
119 |
+
insert <name> <position>
|
120 |
+
insert <tree> <position>
|
121 |
+
|
122 |
+
inserts the named node, or a manually specified tree (see below for
|
123 |
+
syntax), into the position specified. Right now the only ways to
|
124 |
+
specify position are:
|
125 |
+
|
126 |
+
$+ <name> the left sister of the named node
|
127 |
+
$- <name> the right sister of the named node
|
128 |
+
>i <name> the i_th daughter of the named node.
|
129 |
+
>-i <name> the i_th daughter, counting from the right, of the named node.
|
130 |
+
|
131 |
+
move <name> <position>
|
132 |
+
|
133 |
+
moves the named node into the specified position. To be precise, it
|
134 |
+
deletes (*NOT* prunes) the node from the tree, and re-inserts it
|
135 |
+
into the specified position. See above for how to specify position
|
136 |
+
|
137 |
+
replace <name1> <name2>
|
138 |
+
|
139 |
+
deletes name1 and inserts a copy of name2 in its place.
|
140 |
+
|
141 |
+
adjoin <tree> <target-node>
|
142 |
+
|
143 |
+
adjoins the specified auxiliary tree (see below for syntax) into the
|
144 |
+
target node specified. The daughters of the target node will become
|
145 |
+
the daughters of the foot of the auxiliary tree.
|
146 |
+
|
147 |
+
adjoinH <tree> <target-node>
|
148 |
+
|
149 |
+
similar to adjoin, but preserves the target node and makes it the root
|
150 |
+
of <tree>. (It is still accessible as <code>name</code>. The root of
|
151 |
+
the auxiliary tree is ignored.)
|
152 |
+
|
153 |
+
adjoinF <tree> <target-node>
|
154 |
+
|
155 |
+
similar to adjoin, but preserves the target node and makes it the foot
|
156 |
+
of <tree>. (It is still accessible as <code>name</code>, and retains
|
157 |
+
its status as parent of its children. The foot of the auxiliary tree
|
158 |
+
is ignored.)
|
159 |
+
|
160 |
+
coindex <name_1> <name_2> ... <name_m>
|
161 |
+
|
162 |
+
Puts a (Penn Treebank style) coindexation suffix of the form "-N" on
|
163 |
+
each of nodes name_1 through name_m. The value of N will be
|
164 |
+
automatically generated in reference to the existing coindexations
|
165 |
+
in the tree, so that there is never an accidental clash of
|
166 |
+
indices across things that are not meant to be coindexed.
|
167 |
+
|
168 |
+
-----------------------------------------
|
169 |
+
|
170 |
+
Syntax for trees to be inserted or adjoined:
|
171 |
+
|
172 |
+
|
173 |
+
A tree to be adjoined in can be specified with LISP-like
|
174 |
+
parenthetical-bracketing tree syntax such as those used for the Penn
|
175 |
+
Treebank. For example, for the NP "the dog" to be inserted you might
|
176 |
+
use the syntax
|
177 |
+
|
178 |
+
(NP (Det the) (N dog))
|
179 |
+
|
180 |
+
That's all that there is for a tree to be inserted. Auxiliary trees
|
181 |
+
(a la Tree Adjoining Grammar) must also have exactly one frontier node
|
182 |
+
ending in the character "@", which marks it as the "foot" node for
|
183 |
+
adjunction. Final instances of the character "@" in terminal node labels
|
184 |
+
will be removed from the actual label of the tree.
|
185 |
+
|
186 |
+
For example, if you wanted to adjoin the adverb "breathlessly" into a
|
187 |
+
VP, you might specify the following auxiliary tree:
|
188 |
+
|
189 |
+
(VP (Adv breathlessly) VP@ )
|
190 |
+
|
191 |
+
All other instances of "@" in terminal nodes must be escaped (i.e.,
|
192 |
+
appear as \@); this escaping will be removed by tsurgeon.
|
193 |
+
|
194 |
+
In addition, any node of a tree can be named (the same way as in
|
195 |
+
tregex), by appending =<name> to the node label. That name can be
|
196 |
+
referred to by subsequent tsurgeon operations triggered by the same
|
197 |
+
match. All other instances of "=" in node labels must be escaped
|
198 |
+
(i.e., appear as \=); this escaping will be removed by tsurgeon. For
|
199 |
+
example, if you want to insert an NP trace somewhere and coindex it
|
200 |
+
with a node named "antecedent" you might say
|
201 |
+
|
202 |
+
insert (NP (-NONE- *T*=trace)) <node-location>
|
203 |
+
coindex trace antecedent $
|
204 |
+
|
205 |
+
-----------------------------------------
|
206 |
+
Examples of Tsurgeon operations:
|
207 |
+
|
208 |
+
Tree (used in all examples):
|
209 |
+
(ROOT
|
210 |
+
(S
|
211 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia))
|
212 |
+
(VP (VBD was)
|
213 |
+
(VP (VBN arrested)
|
214 |
+
(PP (IN in)
|
215 |
+
(NP (NNP May)))))
|
216 |
+
(. .)))
|
217 |
+
|
218 |
+
Apply delete:
|
219 |
+
VP < PP=prep
|
220 |
+
delete prep
|
221 |
+
Result:
|
222 |
+
(ROOT
|
223 |
+
(S
|
224 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia))
|
225 |
+
(VP (VBD was)
|
226 |
+
(VP (VBN arrested)
|
227 |
+
(. .)))
|
228 |
+
The PP node directly dominated by a VP is removed, as is
|
229 |
+
everything under it.
|
230 |
+
|
231 |
+
Apply prune:
|
232 |
+
S < (NP < NNP=noun)
|
233 |
+
prune noun
|
234 |
+
Result:
|
235 |
+
(ROOT
|
236 |
+
(S
|
237 |
+
(VP (VBD was)
|
238 |
+
(VP (VBN arrested)
|
239 |
+
(PP (IN in)
|
240 |
+
(NP (NNP May)))))
|
241 |
+
(. .)))
|
242 |
+
The NNP node is removed, and since this results in the NP above it
|
243 |
+
having no terminal children, the NP node is deleted as well.
|
244 |
+
Note: This is different from delete in which the NP above the NNP
|
245 |
+
would remain.
|
246 |
+
|
247 |
+
Apply excise:
|
248 |
+
VP < PP=prep
|
249 |
+
excise prep prep
|
250 |
+
Result:
|
251 |
+
(ROOT
|
252 |
+
(S
|
253 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia))
|
254 |
+
(VP (VBD was)
|
255 |
+
(VP (VBN arrested)
|
256 |
+
(IN in)
|
257 |
+
(NP (NNP May)))))
|
258 |
+
(. .)))
|
259 |
+
The PP node is removed, and all of its children are added in the
|
260 |
+
place it was previously located. Excise removes all the nodes from
|
261 |
+
the first named node to the second named node, and the children of
|
262 |
+
the second node are added as children of the parent of the first node.
|
263 |
+
Thus, for another example:
|
264 |
+
VP=verb < PP=prep
|
265 |
+
excise verb prep
|
266 |
+
Result:
|
267 |
+
(ROOT
|
268 |
+
(S
|
269 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia))
|
270 |
+
(VP (VBD was)
|
271 |
+
(IN in)
|
272 |
+
(NP (NNP May)))
|
273 |
+
(. .)))
|
274 |
+
|
275 |
+
|
276 |
+
Apply relabel:
|
277 |
+
VP=v < PP=prep
|
278 |
+
relabel prep verbPrep
|
279 |
+
Result:
|
280 |
+
(ROOT
|
281 |
+
(S
|
282 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia))
|
283 |
+
(VP (VBD was)
|
284 |
+
(VP (VBN arrested)
|
285 |
+
(verbPrep (IN in)
|
286 |
+
(NP (NNP May)))))
|
287 |
+
(. .)))
|
288 |
+
The label for the node called prep (PP) is changed to verbPrep.
|
289 |
+
The other form of relabel uses regular expressions; consider the following
|
290 |
+
operation:
|
291 |
+
/^VB.+/=v
|
292 |
+
relabel v /^VB(.*)$/ #1
|
293 |
+
Result:
|
294 |
+
(ROOT
|
295 |
+
(S
|
296 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia))
|
297 |
+
(VP (D was)
|
298 |
+
(VP (N arrested)
|
299 |
+
(PP (IN in)
|
300 |
+
(NP (NNP May)))))
|
301 |
+
(. .)))
|
302 |
+
The Tregex pattern matches all nodes that begin "VB" and have at least one
|
303 |
+
more character. The Tsurgeon operation then matches the node label to the
|
304 |
+
regular expression "^VB(.*)$" and selects the text matching the first part
|
305 |
+
that is not completely specified in the pattern. In this case, that is the
|
306 |
+
part matching the wildcard (.*), which matches all characters after the VB.
|
307 |
+
The node is then relabeled with that part of the text, causing, for example,
|
308 |
+
"VBD" to be relabeled "D". The "#1" specifies that the name of the node
|
309 |
+
should be the first group in the regex.
|
310 |
+
|
311 |
+
Apply insert (shown here with inserting a node, but could also be a tree):
|
312 |
+
S < (NP < (NNP=name !$- DET))
|
313 |
+
insert (DET Ms.) $+ name
|
314 |
+
Result:
|
315 |
+
(ROOT
|
316 |
+
(S
|
317 |
+
(NP (DET Ms.)
|
318 |
+
(NNP Maria_Eugenia_Ochoa_Garcia))
|
319 |
+
(VP (VBD was)
|
320 |
+
(VP (VBN arrested)
|
321 |
+
(PP (IN in)
|
322 |
+
(NP (NNP May)))))
|
323 |
+
(. .)))
|
324 |
+
The pattern matches the NNP node that is directly dominated by an NP
|
325 |
+
(which is directly dominated by an S) and is not a direct right sister
|
326 |
+
of a DET. Thus, the (DET Ms.) node is inserted immediately to the left
|
327 |
+
of that NNP node, as specified by "$+ name". "$+" is the location and
|
328 |
+
"name" describes what node the location is with respect to.
|
329 |
+
Note: Tsurgeon will re-search for matches after each run of the script;
|
330 |
+
thus, cycles may occur, causing the program to not terminate. The key
|
331 |
+
is to write patterns that match prior to the changes you would like to
|
332 |
+
make but that do not match afterwards. If the clause "!$- DET" had been
|
333 |
+
left out in this example, Tsurgeon would have matched the pattern after
|
334 |
+
every insert operation, causing an infinite number of DETs to be added.
|
335 |
+
|
336 |
+
Apply move:
|
337 |
+
VP=verb < PP=prep
|
338 |
+
move prep $- verb
|
339 |
+
Result:
|
340 |
+
(ROOT
|
341 |
+
(S
|
342 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia))
|
343 |
+
(VP (VBD was)
|
344 |
+
(VP (VBN arrested)))
|
345 |
+
(PP (IN in)
|
346 |
+
(NP (NNP May)))
|
347 |
+
(. .)))
|
348 |
+
The PP is moved out of the VP that dominates it and added as a direct right
|
349 |
+
sister of the VP. As for insert, "$-" specifies the location for prep while
|
350 |
+
"verb" specifies what that location is relative to.
|
351 |
+
Note: "move" is a macro operation that deletes the given node and then inserts
|
352 |
+
it. "move" does not use prune, and thus any branches that now lack terminals will
|
353 |
+
remain rather than being removed.
|
354 |
+
|
355 |
+
Apply replace:
|
356 |
+
S < (NP=name < NNP)
|
357 |
+
replace name (NP (DET A) (NN woman))
|
358 |
+
Result:
|
359 |
+
(ROOT
|
360 |
+
(S
|
361 |
+
(NP (DET A)
|
362 |
+
(NN woman))
|
363 |
+
(VP (VBD was)
|
364 |
+
(VP (VBN arrested)
|
365 |
+
(PP (IN in)
|
366 |
+
(NP (NNP May)))))
|
367 |
+
(. .)))
|
368 |
+
"name" is matched to an NP that is dominated by an S and dominates an NNP, and
|
369 |
+
a new subtree ("(NP (DET A) (NN woman))") is added in the place where "name" was.
|
370 |
+
Note: This operation is vulnerable to falling into an infinite loop. See the note
|
371 |
+
concerning the "insert" operation and how patterns are matched.
|
372 |
+
|
373 |
+
Apply adjoin:
|
374 |
+
S < (NP=name < NNP)
|
375 |
+
adjoin (NP (DET A) (NN woman) NP@) name
|
376 |
+
Result:
|
377 |
+
(ROOT
|
378 |
+
(S
|
379 |
+
(NP (DET A)
|
380 |
+
(NN woman)
|
381 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia)))
|
382 |
+
(VP (VBD was)
|
383 |
+
(VP (VBN arrested)
|
384 |
+
(PP (IN in)
|
385 |
+
(NP (NNP May)))))
|
386 |
+
(. .)))
|
387 |
+
First, the NP is matched to the NP dominating the NNP tag. Then, the specified
|
388 |
+
tree ("(NP (DET A) (NN woman) NP@)") is placed in that location. The "@" symbol
|
389 |
+
specifies that the children of the original NP node ("name") are to be placed
|
390 |
+
as children of a new NP node that is directly to the right of (NN woman). If
|
391 |
+
the specified tree were "(NP (DET A) (NN woman) VP@)" then the child
|
392 |
+
(NNP Maria_Eugenia_Ochoa_Garcia) would appear under a VP. Exactly one "@" node
|
393 |
+
must appear in the specified tree in order to indicate where to place the node
|
394 |
+
from the original tree.
|
395 |
+
|
396 |
+
Apply adjoinH:
|
397 |
+
S < (NP=name < NNP)
|
398 |
+
adjoinH ((NP (DET A) (NN woman) NP@)) name
|
399 |
+
Result:
|
400 |
+
(ROOT
|
401 |
+
(S
|
402 |
+
(NP (NP (DET A)
|
403 |
+
(NN woman)
|
404 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia))))
|
405 |
+
(VP (VBD was)
|
406 |
+
(VP (VBN arrested)
|
407 |
+
(PP (IN in)
|
408 |
+
(NP (NNP May)))))
|
409 |
+
(. .)))
|
410 |
+
This operation differs from adjoin in that it retains the named node (in this
|
411 |
+
case, "name"). The named node is made the root of the specified tree, resulting
|
412 |
+
in two NP nodes dominating the DET in this example whereas only one was present
|
413 |
+
in the previous example. Note that the specified tree is wrapped in an extra
|
414 |
+
pair of parentheses in order to show the syntax for retaining the named node.
|
415 |
+
If the extra parentheses were not there and the specified tree was, for example,
|
416 |
+
(VP (DET A) (NN woman) NP@), the VP would be ignored in order to retain an NP as
|
417 |
+
the root. Thus, in this case, "adjoinH (VP (DET A) (NN woman) NP@) name" and
|
418 |
+
"adjoinH ((DET A) (NN woman) NP@) name" both produce the same tree:
|
419 |
+
(ROOT
|
420 |
+
(S
|
421 |
+
(NP (DET A)
|
422 |
+
(NN woman)
|
423 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia)))
|
424 |
+
(VP (VBD was)
|
425 |
+
(VP (VBN arrested)
|
426 |
+
(PP (IN in)
|
427 |
+
(NP (NNP May)))))
|
428 |
+
(. .)))
|
429 |
+
|
430 |
+
|
431 |
+
Apply adjoinF:
|
432 |
+
S < (NP=name < NNP)
|
433 |
+
adjoinF (NP(DET A) (NN woman) @) name
|
434 |
+
Result:
|
435 |
+
(ROOT
|
436 |
+
(S
|
437 |
+
(NP (DET A)
|
438 |
+
(NN woman)
|
439 |
+
(NP (NNP Maria_Eugenia_Ochoa_Garcia)))
|
440 |
+
(VP (VBD was)
|
441 |
+
(VP (VBN arrested)
|
442 |
+
(PP (IN in)
|
443 |
+
(NP (NNP May)))))
|
444 |
+
(. .)))
|
445 |
+
This operation is very similar to adjoin and adjoinH, but this time the original
|
446 |
+
named node ("name" in this case) is maintained as the root of the subtree that
|
447 |
+
is adjoined. Thus, no node label needs to be given in front of the "@" and if
|
448 |
+
one is given, it will be ignored. For instance, "adjoinF (NP(DET A) (NN woman) VP@) name"
|
449 |
+
would still produce the same tree as above, despite the VP preceding the @.
|
450 |
+
|
451 |
+
Apply coindex:
|
452 |
+
NP=node < NNP=name
|
453 |
+
coindex node name
|
454 |
+
Result:
|
455 |
+
(ROOT
|
456 |
+
(S
|
457 |
+
(NP-1 (NNP-1 Maria_Eugenia_Ochoa_Garcia))
|
458 |
+
(VP (VBD was)
|
459 |
+
(VP (VBN arrested)
|
460 |
+
(PP (IN in)
|
461 |
+
(NP-2 (NNP-2 May)))))
|
462 |
+
(. .)))
|
463 |
+
This causes the named nodes to be numbered such that all nodes that are part
|
464 |
+
of the same match have the same number and all matches have distinct new names.
|
465 |
+
We had two instances of an NP dominating an NNP in this example, and they were
|
466 |
+
renamed such that NP-i < NNP-i for each match, with 1 <= i <= number of matches.
|
467 |
+
|
468 |
+
-----------------------------------------
|
469 |
+
TSURGEON SCRIPTS
|
470 |
+
-----------------------------------------
|
471 |
+
Script format:
|
472 |
+
|
473 |
+
Tsurgeon scripts are a combination of a Tregex pattern to match and a series
|
474 |
+
of Tsurgeon operations to perform on that match. The first line of a Tsurgeon
|
475 |
+
script should be the Tregex pattern. This should be followed by a blank line,
|
476 |
+
and then each subsequent line may contain one Tsurgeon operation. Tsurgeon
|
477 |
+
operations should not be separated by blank lines. The following is an example
|
478 |
+
of correctly formatted script:
|
479 |
+
|
480 |
+
S < NP=node < NNP=name
|
481 |
+
|
482 |
+
relabel node NP_NAME
|
483 |
+
coindex node name
|
484 |
+
|
485 |
+
|
486 |
+
Comments:
|
487 |
+
|
488 |
+
The character % introduces a comment that extends to the end of the
|
489 |
+
line. All other intended uses of % must be escaped as \% .
|
490 |
+
|
491 |
+
-----------------------------------------
|
492 |
+
CONTACT
|
493 |
+
-----------------------------------------
|
494 |
+
|
495 |
+
For questions about this distribution, please contact Stanford's JavaNLP group at
|
496 |
+
[email protected]. We provide assistance on a best-effort basis.
|
497 |
+
|
498 |
+
|
499 |
+
-----------------------------------------
|
500 |
+
LICENSE
|
501 |
+
-----------------------------------------
|
502 |
+
|
503 |
+
Tregex, Tsurgeon, and Interactive Tregex
|
504 |
+
Copyright (c) 2003-2011 The Board of Trustees of
|
505 |
+
The Leland Stanford Junior University. All Rights Reserved.
|
506 |
+
|
507 |
+
This program is free software; you can redistribute it and/or
|
508 |
+
modify it under the terms of the GNU General Public License
|
509 |
+
as published by the Free Software Foundation; either version 2
|
510 |
+
of the License, or (at your option) any later version.
|
511 |
+
|
512 |
+
This program is distributed in the hope that it will be useful,
|
513 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
514 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
515 |
+
GNU General Public License for more details.
|
516 |
+
|
517 |
+
You should have received a copy of the GNU General Public License
|
518 |
+
along with this program; if not, write to the Free Software
|
519 |
+
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
520 |
+
|
521 |
+
For more information, bug reports, fixes, contact:
|
522 |
+
Christopher Manning
|
523 |
+
Dept of Computer Science, Gates 1A
|
524 |
+
Stanford CA 94305-9010
|
525 |
+
USA
|
526 | |
527 |
+
http://www-nlp.stanford.edu/software/tregex.shtml
|
528 |
+
|
529 |
+
|
lng/L2SCA/Semgrex.ppt
ADDED
Binary file (285 kB). View file
|
|
lng/L2SCA/analyzeFolder.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This script analyzes all text files (with the .txt suffix only) in a single folder or directory.
|
3 |
+
|
4 |
+
It counts the occurrences of the following 9 structures in each text: words (W), sentences (S), verb phrases (VP), clauses (C), T-units (T), dependent clauses (DC), complex T-units (CT), coordinate phrases (CP), and complex nominals (CN).
|
5 |
+
|
6 |
+
These frequency counts are then used to compute the following 14 syntactic complexity indices of each text: mean length of sentence (MLS), mean length of T-unit (MLT), mean length of clause (MLC), clauses per sentence (C/S), verb phrases per T-unit (VP/T), clauses per T-unit (C/T), dependent clauses per clause (DC/C), dependent clauses per T-unit (DC/T), T-units per sentence (T/S), complex T-unit ratio (CT/T), coordinate phrases per T-unit (CP/T), coordinate phrases per clause (CP/C), complex nominals per T-unit (CN/T), and complex nominals per clause (CN/C).
|
7 |
+
|
8 |
+
To run the script, type the following at the command line:
|
9 |
+
python analyzeFolder.py inputFileDirectory outputFileName
|
10 |
+
|
11 |
+
inputFileDirectory is the path to the directory or folder that contains the text files you want to analyze (e.g., /home/inputFiles/). The path should end with a slash, as in the example. outputFileName is the name you want to assign to the output file. Both must be provided.
|
12 |
+
|
13 |
+
The first line of the output file will be a comma-delimited list of 24 fields (including Filename, abbreviations of the 9 structures, and abbreviations of the 14 syntactic complexity indices). The subsequent lines of the file will each provide a comma-delimited list of 24 values for one input file (including the name of the file, frequency counts of the 9 structures, and the values of the 14 syntactic complexity indices). This format may be hard to read but allows easy import to Excel or SPSS.
|
14 |
+
"""
|
15 |
+
|
16 |
+
import sys, os, subprocess, glob, re
|
17 |
+
|
18 |
+
#a function to divide two numbers from strings
|
19 |
+
def division(x,y):
    """Divide x by y, where either operand may be a number or a numeric string.

    Returns 0 whenever either operand is zero, so a zero denominator never
    raises ZeroDivisionError (a zero numerator would yield 0 anyway).
    """
    numerator = float(x)
    denominator = float(y)
    if numerator == 0 or denominator == 0:
        return 0
    return numerator / denominator
|
23 |
+
|
24 |
+
#the following is a list of tregex patterns for various structures

#sentence (S)
s="'ROOT'"

#verb phrase (VP)
vp="'VP > S|SINV|SQ'"
vp_q="'MD|VBZ|VBP|VBD > (SQ !< VP)'"

#clause (C)
c="'S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]'"

#T-unit (T)
t="'S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]'"

#dependent clause (DC)
dc="'SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])'"

#complex T-unit (CT)
ct="'S|SBARQ|SINV|SQ [> ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]] << (SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]))'"

#coordinate phrase (CP)
cp="'ADJP|ADVP|NP|VP < CC'"

#complex nominal (CN): three sub-patterns whose counts are summed below
cn1="'NP !> NP [<< JJ|POS|PP|S|VBG | << (NP $++ NP !$+ CC)]'"
cn2="'SBAR [<# WHNP | <# (IN < That|that|For|for) | <, S] & [$+ VP | > VP]'"
cn3="'S < (VP <# VBG|TO) $+ VP'"

#fragment clause
fc="'FRAG > ROOT !<< (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])'"

#fragment T-unit
ft="'FRAG > ROOT !<< (S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP])'"

#list of patterns to search for (order matters: positional indices are used below)
patternlist=[s,vp,c,t,dc,ct,cp,cn1,cn2,cn3,fc,ft,vp_q]

#location of the Stanford parser wrapper script
parserPath="stanford-parser-full-2014-01-04/lexparser.sh"

#path to the directory or folder containing input files
directoryPath=sys.argv[1]

#output file name
outputFile=open(sys.argv[2],"w")

#write a list of 24 comma-delimited fields to the output file
fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
outputFile.write(fields+"\n")

#process text files in the directory one by one
for filename in glob.glob( os.path.join(directoryPath, '*.txt') ):
    print("Processing "+filename+"...")

    #Extract the name of the file being processed
    output=filename.split('/')[-1]

    #name a temporary file to hold the parse trees of the input file
    parsedFile=filename+".parsed"

    #parse the input file
    #NOTE(review): filenames containing spaces or shell metacharacters will
    #break this shell command; quoting would be needed to support them
    command=parserPath + " " + filename + " > " + parsedFile
    a=subprocess.getoutput(command).split('\n')[-1].split()

    #list of counts of the patterns
    patterncount=[]

    #query the parse trees using the tregex patterns; tregex prints the match
    #count on its last output line
    for pattern in patternlist:
        command = "./tregex.sh " + pattern + " " + parsedFile + " -C -o"
        count = subprocess.getoutput(command).split('\n')[-1]
        patterncount.append(int(count))

    #update frequencies: complex nominals = cn1+cn2+cn3; clauses gain fragment
    #clauses; T-units gain fragment T-units; verb phrases gain SQ verbs
    patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
    patterncount[2]=patterncount[2]+patterncount[-3]
    patterncount[3]=patterncount[3]+patterncount[-2]
    patterncount[1]=patterncount[1]+patterncount[-1]

    #word count: count terminal (POS-tag word) pairs in the parse trees;
    #'with' guarantees the handle is closed even if the read fails
    with open(parsedFile,"r") as infile:
        content=infile.read()
    w=len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)",content))

    #add frequencies of words and other structures to output string
    output+=","+str(w) #number of words
    for count in patterncount[:8]:
        output+=","+str(count)

    #list of frequencies of structures other than words
    #(deliberately reuses the pattern variable names; patternlist already
    #holds the pattern strings, so later iterations are unaffected)
    [s,vp,c,t,dc,ct,cp,cn]=patterncount[:8]

    #compute the 14 syntactic complexity indices (division() is zero-safe)
    mls=division(w,s)
    mlt=division(w,t)
    mlc=division(w,c)
    c_s=division(c,s)
    vp_t=division(vp,t)
    c_t=division(c,t)
    dc_c=division(dc,c)
    dc_t=division(dc,t)
    t_s=division(t,s)
    ct_t=division(ct,t)
    cp_t=division(cp,t)
    cp_c=division(cp,c)
    cn_t=division(cn,t)
    cn_c=division(cn,c)

    #add syntactic complexity indices to output string, 4 decimal places
    for ratio in [mls,mlt,mlc,c_s,vp_t,c_t,dc_c,dc_t,t_s,ct_t,cp_t,cp_c,cn_t,cn_c]:
        output+=","+str("%.4F" % ratio)

    #write output string to output file
    outputFile.write(output+"\n")

    #delete the temporary file holding the parse trees
    #(os.remove completes synchronously and needs no shell, unlike the
    #previous os.popen("rm ..."), which returned without waiting)
    os.remove(parsedFile)

outputFile.close()

print("Done. Output was saved to " + sys.argv[2] +".")
|
lng/L2SCA/analyzeText.py
ADDED
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This script analyzes a single plain text file.
|
3 |
+
|
4 |
+
It counts the occurrences of the following 9 structures in the text: words (W), sentences (S), verb phrases (VP), clauses (C), T-units (T), dependent clauses (DC), complex T-units (CT), coordinate phrases (CP), and complex nominals (CN).
|
5 |
+
|
6 |
+
These frequency counts are then used to compute the following 14 syntactic complexity indices of the text: mean length of sentence (MLS), mean length of T-unit (MLT), mean length of clause (MLC), clauses per sentence (C/S), verb phrases per T-unit (VP/T), clauses per T-unit (C/T), dependent clauses per clause (DC/C), dependent clauses per T-unit (DC/T), T-units per sentence (T/S), complex T-unit ratio (CT/T), coordinate phrases per T-unit (CP/T), coordinate phrases per clause (CP/C), complex nominals per T-unit (CN/T), and complex nominals per clause (CN/C).
|
7 |
+
|
8 |
+
This module exposes the function sca(input_text), which analyzes a single text string and returns its syntactic complexity profile.

sca returns a list of 23 values: the word count (W), the frequency counts of the 8 other structures (S, VP, C, T, DC, CT, CP, CN), and the values of the 14 syntactic complexity indices listed above, in that order.
"""
|
15 |
+
|
16 |
+
import sys, os, subprocess, re, tempfile
|
17 |
+
|
18 |
+
# Safe ratio helper: tolerates numeric strings and zero operands.
def division(x,y):
    """Return x / y as a float, or 0 when either operand is zero.

    Accepts numbers or numeric strings. A zero denominator (or a zero
    numerator) yields 0 instead of raising ZeroDivisionError, so the
    complexity indices degrade gracefully on empty counts.
    """
    numerator = float(x)
    denominator = float(y)
    if numerator == 0 or denominator == 0:
        return 0
    return numerator / denominator
|
23 |
+
|
24 |
+
#the following is a list of tregex patterns for various structures
#NOTE(review): the ORDER of patternlist matters -- sca() folds the
#trailing patterns (cn2, cn3, fc, ft, vp_q) into the main counts via
#negative indices (patterncount[-1] .. patterncount[-6]); do not
#reorder or extend this list without updating that arithmetic.

#sentence (S): every parse tree root counts as one sentence
s="ROOT"

#verb phrase (VP); vp_q additionally matches a finite verb/modal
#heading an SQ that has no VP node (subject-aux inversion questions)
vp="VP > S|SINV|SQ"
vp_q="MD|VBZ|VBP|VBD > (SQ !< VP)"

#clause (C): an S/SINV/SQ with a finite (or root imperative) verb
c="S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]"

#T-unit (T): a main clause plus any clauses attached to it
t="S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]"

#dependent clause (DC): a finite clause dominated by SBAR
dc="SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])"

#complex T-unit (CT): a T-unit containing a dependent clause
ct="S|SBARQ|SINV|SQ [> ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]] << (SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]))"

#coordinate phrase (CP): a coordinated adjective/adverb/noun/verb phrase
cp="ADJP|ADVP|NP|VP < CC"

#complex nominal (CN), three sub-cases summed by sca():
#cn1: NP with a modifier; cn2: nominal/complement SBAR; cn3: gerund or
#infinitive subject clause
cn1="NP !> NP [<< JJ|POS|PP|S|VBG | << (NP $++ NP !$+ CC)]"
cn2="SBAR [<# WHNP | <# (IN < That|that|For|for) | <, S] & [$+ VP | > VP]"
cn3="S < (VP <# VBG|TO) $+ VP"

#fragment clause: a root FRAG containing no real clause (added to C)
fc="FRAG > ROOT !<< (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])"

#fragment T-unit: a root FRAG containing no real T-unit (added to T)
ft="FRAG > ROOT !<< (S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP])"

#list of patterns to search for (see ordering note above)
patternlist=[s,vp,c,t,dc,ct,cp,cn1,cn2,cn3,fc,ft,vp_q]

#repo-relative base directory for the L2SCA tools; assumes the process
#is launched from the repository root -- TODO confirm
pre_path = 'lng/L2SCA'

#location of the Stanford parser wrapper script
parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04/lexparser.sh")
|
66 |
+
|
67 |
+
def sca(input_text):
    """Run the L2 Syntactic Complexity Analyzer on a text string.

    Writes the text to a temporary file, parses it with the Stanford
    parser (parserPath), counts syntactic structures with tregex, and
    computes the 14 syntactic complexity indices.

    Args:
        input_text: plain text to analyze (str).

    Returns:
        A list of 23 numbers: word count, the 8 structure counts
        (S, VP, C, T, DC, CT, CP, CN), then the 14 indices
        (MLS, MLT, MLC, C/S, VP/T, C/T, DC/C, DC/T, T/S, CT/T,
        CP/T, CP/C, CN/T, CN/C).

    Raises:
        subprocess.CalledProcessError: if a tregex invocation fails.
    """
    # mkstemp instead of the private tempfile._get_candidate_names API
    fd, inputFile = tempfile.mkstemp(suffix=".txt")
    with os.fdopen(fd, "w") as f:
        f.write(input_text + "\n")

    # temporary file holding the parse trees of the input file
    parsedFile = inputFile + ".parsed"

    try:
        # parse the input file; parser chatter goes to /dev/null
        with open(parsedFile, "w") as f:
            subprocess.run([parserPath, inputFile], stdout=f,
                           stderr=subprocess.DEVNULL)

        # counts of the tregex patterns, in patternlist order
        patterncount = []
        for pattern in patternlist:
            command = [os.path.join(pre_path, "tregex.sh"), pattern,
                       parsedFile, "-C", "-o"]
            out = subprocess.run(command, check=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.DEVNULL)
            # -C makes tregex print only the match count; empty output => 0
            patterncount.append(int(out.stdout) if out.stdout.strip() else 0)

        # fold the auxiliary patterns into the main counts by position:
        # CN = cn1+cn2+cn3; C += fragment clauses; T += fragment T-units;
        # VP += finite verbs of subject-aux questions
        patterncount[7] = patterncount[-4] + patterncount[-5] + patterncount[-6]
        patterncount[2] = patterncount[2] + patterncount[-3]
        patterncount[3] = patterncount[3] + patterncount[-2]
        patterncount[1] = patterncount[1] + patterncount[-1]

        # word count: each "(TAG token)" terminal in the parse trees
        with open(parsedFile, "r") as infile:
            content = infile.read()
        w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", content))

        # frequencies of words and the 8 other structures
        output = [int(w)]
        for count in patterncount[:8]:
            output.append(int(count))

        # structure frequencies other than words
        [s, vp, c, t, dc, ct, cp, cn] = patterncount[:8]

        # the 14 syntactic complexity indices, in the documented order
        for ratio in [division(w, s), division(w, t), division(w, c),
                      division(c, s), division(vp, t), division(c, t),
                      division(dc, c), division(dc, t), division(t, s),
                      division(ct, t), division(cp, t), division(cp, c),
                      division(cn, t), division(cn, c)]:
            output.append(ratio)

        return output
    finally:
        # remove the temporaries even if parsing or counting failed
        for path in (inputFile, parsedFile):
            if os.path.exists(path):
                os.remove(path)
|
lng/L2SCA/examples/atree
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))
|
lng/L2SCA/examples/exciseNP
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
NP < (NP=np < NNS) < (NP=np1 < NN)
|
2 |
+
|
3 |
+
excise np np
|
4 |
+
excise np1 np1
|
5 |
+
|
6 |
+
|
lng/L2SCA/examples/relabelWithGroupName
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/^VB.+/=v
|
2 |
+
|
3 |
+
relabel v /^VB(.*)$/ #1
|
4 |
+
|
lng/L2SCA/examples/renameVerb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
VBZ=vbz $ NP
|
2 |
+
|
3 |
+
relabel vbz MYVERB
|
lng/L2SCA/lib/ABOUT-AppleJavaExtensions.txt
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AppleJavaExtensions.jar provides some stub methods to allow
|
2 |
+
compilation of code which contains Java methods that reference Mac OS
|
3 |
+
X specific Java APIs on any platform. This is needed only for
|
4 |
+
compilation of the class edu.stanford.nlp.trees.tregex.gui.OSXAdapter .
|
5 |
+
Using this class and the links to Apple-specific technologies is
|
6 |
+
required to allow the Mac version of Tregex to behave like a normal
|
7 |
+
Mac application in responding to About and Preferences... menu items.
|
8 |
+
|
9 |
+
This library is not needed or used at runtime on any platform.
|
10 |
+
|
11 |
+
If you'd prefer not to have these complications in the source for your
|
12 |
+
use on other platforms, simply delete both AppleJavaExtensions.jar and
|
13 |
+
the file src/edu/stanford/nlp/trees/tregex/gui/OSXAdapter.java . The
|
14 |
+
OSXAdapter class is loaded using reflection by the main TregexGUI
|
15 |
+
class, so its absence will not cause any errors in compilation.
|
16 |
+
|
17 |
+
The file README-AppleJavaExtensions.txt contains Apple's README and
|
18 |
+
license information for AppleJavaExtensions.jar . More information on
|
19 |
+
AppleJavaExtensions can be found at:
|
20 |
+
|
21 |
+
http://developer.apple.com/samplecode/AppleJavaExtensions/
|
22 |
+
|
23 |
+
This issue of needing to include AppleJavaExtensions.jar occurs for
|
24 |
+
many Java GUI programs which want to function well on Mac OS X,
|
25 |
+
including NetBeans, FindBugs, etc. Do a Google search on:
|
26 |
+
|
27 |
+
AppleJavaExtensions license
|
28 |
+
|
29 |
+
to find examples.
|
lng/L2SCA/lib/AppleJavaExtensions.jar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f0f46aaca0deba5d07490f66d420a58e5a17e4fe8b5118a3ae831207d953f52b
|
3 |
+
size 4189
|
lng/L2SCA/lib/README-AppleJavaExtensions.txt
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AppleJavaExtensions
|
2 |
+
v 1.2
|
3 |
+
|
4 |
+
This is a pluggable jar of stub classes representing the new Apple eAWT and eIO APIs for Java 1.4 on Mac OS X. The purpose of these stubs is to allow for compilation of eAWT- or eIO-referencing code on platforms other than Mac OS X. The jar file is enclosed in a zip archive for easy expansion on other platforms.
|
5 |
+
|
6 |
+
These stubs are not intended for the runtime classpath on non-Mac platforms. Please see the OSXAdapter sample for how to write cross-platform code that uses eAWT.
|
7 |
+
|
8 |
+
Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple
|
9 |
+
Computer, Inc. ("Apple") in consideration of your agreement to the
|
10 |
+
following terms, and your use, installation, modification or
|
11 |
+
redistribution of this Apple software constitutes acceptance of these
|
12 |
+
terms. If you do not agree with these terms, please do not use,
|
13 |
+
install, modify or redistribute this Apple software.
|
14 |
+
|
15 |
+
In consideration of your agreement to abide by the following terms, and
|
16 |
+
subject to these terms, Apple grants you a personal, non-exclusive
|
17 |
+
license, under Apple's copyrights in this original Apple software (the
|
18 |
+
"Apple Software"), to use, reproduce, modify and redistribute the Apple
|
19 |
+
Software, with or without modifications, in source and/or binary forms;
|
20 |
+
provided that if you redistribute the Apple Software in its entirety and
|
21 |
+
without modifications, you must retain this notice and the following
|
22 |
+
text and disclaimers in all such redistributions of the Apple Software.
|
23 |
+
Neither the name, trademarks, service marks or logos of Apple Computer,
|
24 |
+
Inc. may be used to endorse or promote products derived from the Apple
|
25 |
+
Software without specific prior written permission from Apple. Except
|
26 |
+
as expressly stated in this notice, no other rights or licenses, express
|
27 |
+
or implied, are granted by Apple herein, including but not limited to
|
28 |
+
any patent rights that may be infringed by your derivative works or by
|
29 |
+
other works in which the Apple Software may be incorporated.
|
30 |
+
|
31 |
+
The Apple Software is provided by Apple on an "AS IS" basis. APPLE
|
32 |
+
MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
|
33 |
+
THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
|
34 |
+
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
|
35 |
+
OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
|
36 |
+
|
37 |
+
IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
|
38 |
+
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
39 |
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
40 |
+
INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
|
41 |
+
MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
|
42 |
+
AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
|
43 |
+
STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
|
44 |
+
POSSIBILITY OF SUCH DAMAGE.
|
45 |
+
|
46 |
+
Copyright © 2003-2006 Apple Computer, Inc., All Rights Reserved
|
lng/L2SCA/run-tregex-gui.bat
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
java -mx300m -cp "stanford-tregex.jar;" edu.stanford.nlp.trees.tregex.gui.TregexGUI
|
lng/L2SCA/run-tregex-gui.command
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
#!/bin/sh
# Launch the Tregex GUI using the jar that sits next to this script.
# $(dirname "$0") is quoted so the launcher works when the install
# directory path contains spaces (the old unquoted backtick form broke).
java -mx300m -cp "$(dirname "$0")/stanford-tregex.jar" edu.stanford.nlp.trees.tregex.gui.TregexGUI
|
lng/L2SCA/samples/my_sample.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
A few people in a restaurant setting, one of them is drinking orange juice.
|
lng/L2SCA/samples/sample1.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Scores of properties are under extreme fire threat as a huge blaze
|
2 |
+
continues to advance through Sydney's north-western suburbs. Fires
|
3 |
+
have also shut down the major road and rail links between Sydney and
|
4 |
+
Gosford.
|
5 |
+
|
6 |
+
The promotional stop in Sydney was everything to be expected for a
|
7 |
+
Hollywood blockbuster - phalanxes of photographers, a stretch limo to
|
8 |
+
a hotel across the Quay - but with one difference. A line-up of
|
9 |
+
masseurs was waiting to take the media in hand. Never has the term
|
10 |
+
"massaging the media" seemed so accurate.
|
lng/L2SCA/samples/sample1_output
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C
|
2 |
+
sample1.txt,87,5,11,7,5,2,2,2,12,17.4000,17.4000,12.4286,1.4000,2.2000,1.4000,0.2857,0.4000,1.0000,0.4000,0.4000,0.2857,2.4000,1.7143
|
lng/L2SCA/samples/sample2.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Naaman was the captain of the host of King of Syria. He was a great and honourable man but he had a disease called Leprosy. The little maid that served Naaman's wife told her if her master, Naaman, was with the prophet in Samaria, he would be healed. So the King of Syria sent Naaman, along with a letter to the King of Israel. When the King of Israel received the letter, he tore his clothes. The King was upset because he knew that he could not heal Naaman.
|
lng/L2SCA/samples/samples_output
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C
|
2 |
+
sample1.txt,87,5,11,7,5,2,2,2,12,17.4000,17.4000,12.4286,1.4000,2.2000,1.4000,0.2857,0.4000,1.0000,0.4000,0.4000,0.2857,2.4000,1.7143
|
3 |
+
sample2.txt,90,6,13,13,7,5,3,1,13,15.0000,12.8571,6.9231,2.1667,1.8571,1.8571,0.3846,0.7143,1.1667,0.4286,0.1429,0.0769,1.8571,1.0000
|
lng/L2SCA/stanford-parser-full-2014-01-04/LICENSE.txt
ADDED
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
GNU GENERAL PUBLIC LICENSE
|
2 |
+
Version 2, June 1991
|
3 |
+
|
4 |
+
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
|
5 |
+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
6 |
+
Everyone is permitted to copy and distribute verbatim copies
|
7 |
+
of this license document, but changing it is not allowed.
|
8 |
+
|
9 |
+
Preamble
|
10 |
+
|
11 |
+
The licenses for most software are designed to take away your
|
12 |
+
freedom to share and change it. By contrast, the GNU General Public
|
13 |
+
License is intended to guarantee your freedom to share and change free
|
14 |
+
software--to make sure the software is free for all its users. This
|
15 |
+
General Public License applies to most of the Free Software
|
16 |
+
Foundation's software and to any other program whose authors commit to
|
17 |
+
using it. (Some other Free Software Foundation software is covered by
|
18 |
+
the GNU Lesser General Public License instead.) You can apply it to
|
19 |
+
your programs, too.
|
20 |
+
|
21 |
+
When we speak of free software, we are referring to freedom, not
|
22 |
+
price. Our General Public Licenses are designed to make sure that you
|
23 |
+
have the freedom to distribute copies of free software (and charge for
|
24 |
+
this service if you wish), that you receive source code or can get it
|
25 |
+
if you want it, that you can change the software or use pieces of it
|
26 |
+
in new free programs; and that you know you can do these things.
|
27 |
+
|
28 |
+
To protect your rights, we need to make restrictions that forbid
|
29 |
+
anyone to deny you these rights or to ask you to surrender the rights.
|
30 |
+
These restrictions translate to certain responsibilities for you if you
|
31 |
+
distribute copies of the software, or if you modify it.
|
32 |
+
|
33 |
+
For example, if you distribute copies of such a program, whether
|
34 |
+
gratis or for a fee, you must give the recipients all the rights that
|
35 |
+
you have. You must make sure that they, too, receive or can get the
|
36 |
+
source code. And you must show them these terms so they know their
|
37 |
+
rights.
|
38 |
+
|
39 |
+
We protect your rights with two steps: (1) copyright the software, and
|
40 |
+
(2) offer you this license which gives you legal permission to copy,
|
41 |
+
distribute and/or modify the software.
|
42 |
+
|
43 |
+
Also, for each author's protection and ours, we want to make certain
|
44 |
+
that everyone understands that there is no warranty for this free
|
45 |
+
software. If the software is modified by someone else and passed on, we
|
46 |
+
want its recipients to know that what they have is not the original, so
|
47 |
+
that any problems introduced by others will not reflect on the original
|
48 |
+
authors' reputations.
|
49 |
+
|
50 |
+
Finally, any free program is threatened constantly by software
|
51 |
+
patents. We wish to avoid the danger that redistributors of a free
|
52 |
+
program will individually obtain patent licenses, in effect making the
|
53 |
+
program proprietary. To prevent this, we have made it clear that any
|
54 |
+
patent must be licensed for everyone's free use or not licensed at all.
|
55 |
+
|
56 |
+
The precise terms and conditions for copying, distribution and
|
57 |
+
modification follow.
|
58 |
+
|
59 |
+
GNU GENERAL PUBLIC LICENSE
|
60 |
+
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
61 |
+
|
62 |
+
0. This License applies to any program or other work which contains
|
63 |
+
a notice placed by the copyright holder saying it may be distributed
|
64 |
+
under the terms of this General Public License. The "Program", below,
|
65 |
+
refers to any such program or work, and a "work based on the Program"
|
66 |
+
means either the Program or any derivative work under copyright law:
|
67 |
+
that is to say, a work containing the Program or a portion of it,
|
68 |
+
either verbatim or with modifications and/or translated into another
|
69 |
+
language. (Hereinafter, translation is included without limitation in
|
70 |
+
the term "modification".) Each licensee is addressed as "you".
|
71 |
+
|
72 |
+
Activities other than copying, distribution and modification are not
|
73 |
+
covered by this License; they are outside its scope. The act of
|
74 |
+
running the Program is not restricted, and the output from the Program
|
75 |
+
is covered only if its contents constitute a work based on the
|
76 |
+
Program (independent of having been made by running the Program).
|
77 |
+
Whether that is true depends on what the Program does.
|
78 |
+
|
79 |
+
1. You may copy and distribute verbatim copies of the Program's
|
80 |
+
source code as you receive it, in any medium, provided that you
|
81 |
+
conspicuously and appropriately publish on each copy an appropriate
|
82 |
+
copyright notice and disclaimer of warranty; keep intact all the
|
83 |
+
notices that refer to this License and to the absence of any warranty;
|
84 |
+
and give any other recipients of the Program a copy of this License
|
85 |
+
along with the Program.
|
86 |
+
|
87 |
+
You may charge a fee for the physical act of transferring a copy, and
|
88 |
+
you may at your option offer warranty protection in exchange for a fee.
|
89 |
+
|
90 |
+
2. You may modify your copy or copies of the Program or any portion
|
91 |
+
of it, thus forming a work based on the Program, and copy and
|
92 |
+
distribute such modifications or work under the terms of Section 1
|
93 |
+
above, provided that you also meet all of these conditions:
|
94 |
+
|
95 |
+
a) You must cause the modified files to carry prominent notices
|
96 |
+
stating that you changed the files and the date of any change.
|
97 |
+
|
98 |
+
b) You must cause any work that you distribute or publish, that in
|
99 |
+
whole or in part contains or is derived from the Program or any
|
100 |
+
part thereof, to be licensed as a whole at no charge to all third
|
101 |
+
parties under the terms of this License.
|
102 |
+
|
103 |
+
c) If the modified program normally reads commands interactively
|
104 |
+
when run, you must cause it, when started running for such
|
105 |
+
interactive use in the most ordinary way, to print or display an
|
106 |
+
announcement including an appropriate copyright notice and a
|
107 |
+
notice that there is no warranty (or else, saying that you provide
|
108 |
+
a warranty) and that users may redistribute the program under
|
109 |
+
these conditions, and telling the user how to view a copy of this
|
110 |
+
License. (Exception: if the Program itself is interactive but
|
111 |
+
does not normally print such an announcement, your work based on
|
112 |
+
the Program is not required to print an announcement.)
|
113 |
+
|
114 |
+
These requirements apply to the modified work as a whole. If
|
115 |
+
identifiable sections of that work are not derived from the Program,
|
116 |
+
and can be reasonably considered independent and separate works in
|
117 |
+
themselves, then this License, and its terms, do not apply to those
|
118 |
+
sections when you distribute them as separate works. But when you
|
119 |
+
distribute the same sections as part of a whole which is a work based
|
120 |
+
on the Program, the distribution of the whole must be on the terms of
|
121 |
+
this License, whose permissions for other licensees extend to the
|
122 |
+
entire whole, and thus to each and every part regardless of who wrote it.
|
123 |
+
|
124 |
+
Thus, it is not the intent of this section to claim rights or contest
|
125 |
+
your rights to work written entirely by you; rather, the intent is to
|
126 |
+
exercise the right to control the distribution of derivative or
|
127 |
+
collective works based on the Program.
|
128 |
+
|
129 |
+
In addition, mere aggregation of another work not based on the Program
|
130 |
+
with the Program (or with a work based on the Program) on a volume of
|
131 |
+
a storage or distribution medium does not bring the other work under
|
132 |
+
the scope of this License.
|
133 |
+
|
134 |
+
3. You may copy and distribute the Program (or a work based on it,
|
135 |
+
under Section 2) in object code or executable form under the terms of
|
136 |
+
Sections 1 and 2 above provided that you also do one of the following:
|
137 |
+
|
138 |
+
a) Accompany it with the complete corresponding machine-readable
|
139 |
+
source code, which must be distributed under the terms of Sections
|
140 |
+
1 and 2 above on a medium customarily used for software interchange; or,
|
141 |
+
|
142 |
+
b) Accompany it with a written offer, valid for at least three
|
143 |
+
years, to give any third party, for a charge no more than your
|
144 |
+
cost of physically performing source distribution, a complete
|
145 |
+
machine-readable copy of the corresponding source code, to be
|
146 |
+
distributed under the terms of Sections 1 and 2 above on a medium
|
147 |
+
customarily used for software interchange; or,
|
148 |
+
|
149 |
+
c) Accompany it with the information you received as to the offer
|
150 |
+
to distribute corresponding source code. (This alternative is
|
151 |
+
allowed only for noncommercial distribution and only if you
|
152 |
+
received the program in object code or executable form with such
|
153 |
+
an offer, in accord with Subsection b above.)
|
154 |
+
|
155 |
+
The source code for a work means the preferred form of the work for
|
156 |
+
making modifications to it. For an executable work, complete source
|
157 |
+
code means all the source code for all modules it contains, plus any
|
158 |
+
associated interface definition files, plus the scripts used to
|
159 |
+
control compilation and installation of the executable. However, as a
|
160 |
+
special exception, the source code distributed need not include
|
161 |
+
anything that is normally distributed (in either source or binary
|
162 |
+
form) with the major components (compiler, kernel, and so on) of the
|
163 |
+
operating system on which the executable runs, unless that component
|
164 |
+
itself accompanies the executable.
|
165 |
+
|
166 |
+
If distribution of executable or object code is made by offering
|
167 |
+
access to copy from a designated place, then offering equivalent
|
168 |
+
access to copy the source code from the same place counts as
|
169 |
+
distribution of the source code, even though third parties are not
|
170 |
+
compelled to copy the source along with the object code.
|
171 |
+
|
172 |
+
4. You may not copy, modify, sublicense, or distribute the Program
|
173 |
+
except as expressly provided under this License. Any attempt
|
174 |
+
otherwise to copy, modify, sublicense or distribute the Program is
|
175 |
+
void, and will automatically terminate your rights under this License.
|
176 |
+
However, parties who have received copies, or rights, from you under
|
177 |
+
this License will not have their licenses terminated so long as such
|
178 |
+
parties remain in full compliance.
|
179 |
+
|
180 |
+
5. You are not required to accept this License, since you have not
|
181 |
+
signed it. However, nothing else grants you permission to modify or
|
182 |
+
distribute the Program or its derivative works. These actions are
|
183 |
+
prohibited by law if you do not accept this License. Therefore, by
|
184 |
+
modifying or distributing the Program (or any work based on the
|
185 |
+
Program), you indicate your acceptance of this License to do so, and
|
186 |
+
all its terms and conditions for copying, distributing or modifying
|
187 |
+
the Program or works based on it.
|
188 |
+
|
189 |
+
6. Each time you redistribute the Program (or any work based on the
|
190 |
+
Program), the recipient automatically receives a license from the
|
191 |
+
original licensor to copy, distribute or modify the Program subject to
|
192 |
+
these terms and conditions. You may not impose any further
|
193 |
+
restrictions on the recipients' exercise of the rights granted herein.
|
194 |
+
You are not responsible for enforcing compliance by third parties to
|
195 |
+
this License.
|
196 |
+
|
197 |
+
7. If, as a consequence of a court judgment or allegation of patent
|
198 |
+
infringement or for any other reason (not limited to patent issues),
|
199 |
+
conditions are imposed on you (whether by court order, agreement or
|
200 |
+
otherwise) that contradict the conditions of this License, they do not
|
201 |
+
excuse you from the conditions of this License. If you cannot
|
202 |
+
distribute so as to satisfy simultaneously your obligations under this
|
203 |
+
License and any other pertinent obligations, then as a consequence you
|
204 |
+
may not distribute the Program at all. For example, if a patent
|
205 |
+
license would not permit royalty-free redistribution of the Program by
|
206 |
+
all those who receive copies directly or indirectly through you, then
|
207 |
+
the only way you could satisfy both it and this License would be to
|
208 |
+
refrain entirely from distribution of the Program.
|
209 |
+
|
210 |
+
If any portion of this section is held invalid or unenforceable under
|
211 |
+
any particular circumstance, the balance of the section is intended to
|
212 |
+
apply and the section as a whole is intended to apply in other
|
213 |
+
circumstances.
|
214 |
+
|
215 |
+
It is not the purpose of this section to induce you to infringe any
|
216 |
+
patents or other property right claims or to contest validity of any
|
217 |
+
such claims; this section has the sole purpose of protecting the
|
218 |
+
integrity of the free software distribution system, which is
|
219 |
+
implemented by public license practices. Many people have made
|
220 |
+
generous contributions to the wide range of software distributed
|
221 |
+
through that system in reliance on consistent application of that
|
222 |
+
system; it is up to the author/donor to decide if he or she is willing
|
223 |
+
to distribute software through any other system and a licensee cannot
|
224 |
+
impose that choice.
|
225 |
+
|
226 |
+
This section is intended to make thoroughly clear what is believed to
|
227 |
+
be a consequence of the rest of this License.
|
228 |
+
|
229 |
+
8. If the distribution and/or use of the Program is restricted in
|
230 |
+
certain countries either by patents or by copyrighted interfaces, the
|
231 |
+
original copyright holder who places the Program under this License
|
232 |
+
may add an explicit geographical distribution limitation excluding
|
233 |
+
those countries, so that distribution is permitted only in or among
|
234 |
+
countries not thus excluded. In such case, this License incorporates
|
235 |
+
the limitation as if written in the body of this License.
|
236 |
+
|
237 |
+
9. The Free Software Foundation may publish revised and/or new versions
|
238 |
+
of the General Public License from time to time. Such new versions will
|
239 |
+
be similar in spirit to the present version, but may differ in detail to
|
240 |
+
address new problems or concerns.
|
241 |
+
|
242 |
+
Each version is given a distinguishing version number. If the Program
|
243 |
+
specifies a version number of this License which applies to it and "any
|
244 |
+
later version", you have the option of following the terms and conditions
|
245 |
+
either of that version or of any later version published by the Free
|
246 |
+
Software Foundation. If the Program does not specify a version number of
|
247 |
+
this License, you may choose any version ever published by the Free Software
|
248 |
+
Foundation.
|
249 |
+
|
250 |
+
10. If you wish to incorporate parts of the Program into other free
|
251 |
+
programs whose distribution conditions are different, write to the author
|
252 |
+
to ask for permission. For software which is copyrighted by the Free
|
253 |
+
Software Foundation, write to the Free Software Foundation; we sometimes
|
254 |
+
make exceptions for this. Our decision will be guided by the two goals
|
255 |
+
of preserving the free status of all derivatives of our free software and
|
256 |
+
of promoting the sharing and reuse of software generally.
|
257 |
+
|
258 |
+
NO WARRANTY
|
259 |
+
|
260 |
+
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
261 |
+
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
262 |
+
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
263 |
+
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
264 |
+
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
265 |
+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
266 |
+
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
267 |
+
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
268 |
+
REPAIR OR CORRECTION.
|
269 |
+
|
270 |
+
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
271 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
272 |
+
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
273 |
+
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
274 |
+
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
275 |
+
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
276 |
+
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
277 |
+
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
278 |
+
POSSIBILITY OF SUCH DAMAGES.
|
279 |
+
|
280 |
+
END OF TERMS AND CONDITIONS
|
281 |
+
|
282 |
+
How to Apply These Terms to Your New Programs
|
283 |
+
|
284 |
+
If you develop a new program, and you want it to be of the greatest
|
285 |
+
possible use to the public, the best way to achieve this is to make it
|
286 |
+
free software which everyone can redistribute and change under these terms.
|
287 |
+
|
288 |
+
To do so, attach the following notices to the program. It is safest
|
289 |
+
to attach them to the start of each source file to most effectively
|
290 |
+
convey the exclusion of warranty; and each file should have at least
|
291 |
+
the "copyright" line and a pointer to where the full notice is found.
|
292 |
+
|
293 |
+
<one line to give the program's name and a brief idea of what it does.>
|
294 |
+
Copyright (C) <year> <name of author>
|
295 |
+
|
296 |
+
This program is free software; you can redistribute it and/or modify
|
297 |
+
it under the terms of the GNU General Public License as published by
|
298 |
+
the Free Software Foundation; either version 2 of the License, or
|
299 |
+
(at your option) any later version.
|
300 |
+
|
301 |
+
This program is distributed in the hope that it will be useful,
|
302 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
303 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
304 |
+
GNU General Public License for more details.
|
305 |
+
|
306 |
+
You should have received a copy of the GNU General Public License along
|
307 |
+
with this program; if not, write to the Free Software Foundation, Inc.,
|
308 |
+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
309 |
+
|
310 |
+
Also add information on how to contact you by electronic and paper mail.
|
311 |
+
|
312 |
+
If the program is interactive, make it output a short notice like this
|
313 |
+
when it starts in an interactive mode:
|
314 |
+
|
315 |
+
Gnomovision version 69, Copyright (C) year name of author
|
316 |
+
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
317 |
+
This is free software, and you are welcome to redistribute it
|
318 |
+
under certain conditions; type `show c' for details.
|
319 |
+
|
320 |
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
321 |
+
parts of the General Public License. Of course, the commands you use may
|
322 |
+
be called something other than `show w' and `show c'; they could even be
|
323 |
+
mouse-clicks or menu items--whatever suits your program.
|
324 |
+
|
325 |
+
You should also get your employer (if you work as a programmer) or your
|
326 |
+
school, if any, to sign a "copyright disclaimer" for the program, if
|
327 |
+
necessary. Here is a sample; alter the names:
|
328 |
+
|
329 |
+
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
330 |
+
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
331 |
+
|
332 |
+
<signature of Ty Coon>, 1 April 1989
|
333 |
+
Ty Coon, President of Vice
|
334 |
+
|
335 |
+
This General Public License does not permit incorporating your program into
|
336 |
+
proprietary programs. If your program is a subroutine library, you may
|
337 |
+
consider it more useful to permit linking proprietary applications with the
|
338 |
+
library. If this is what you want to do, use the GNU Lesser General
|
339 |
+
Public License instead of this License.
|
lng/L2SCA/stanford-parser-full-2014-01-04/Makefile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This is a rudimentary Makefile for rebuilding the parser.
|
2 |
+
# We actually use ant (q.v.) or a Java IDE.
|
3 |
+
|
4 |
+
JAVAC = javac
|
5 |
+
JAVAFLAGS = -O -d classes -encoding utf-8
|
6 |
+
|
7 |
+
parser:
|
8 |
+
mkdir -p classes
|
9 |
+
$(JAVAC) $(JAVAFLAGS) src/edu/stanford/nlp/*/*.java \
|
10 |
+
src/edu/stanford/nlp/*/*/*.java src/edu/stanford/nlp/*/*/*/*.java
|
11 |
+
cd classes ; jar -cfm ../stanford-parser-`date +%Y-%m-%d`.jar ../src/edu/stanford/nlp/parser/lexparser/lexparser-manifest.txt edu ; cd ..
|
12 |
+
cp stanford-parser-`date +%Y-%m-%d`.jar stanford-parser.jar
|
13 |
+
rm -rf classes
|
lng/L2SCA/stanford-parser-full-2014-01-04/ParserDemo.java
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import java.util.Collection;
|
3 |
+
import java.util.List;
|
4 |
+
import java.io.StringReader;
|
5 |
+
|
6 |
+
import edu.stanford.nlp.process.Tokenizer;
|
7 |
+
import edu.stanford.nlp.process.TokenizerFactory;
|
8 |
+
import edu.stanford.nlp.process.CoreLabelTokenFactory;
|
9 |
+
import edu.stanford.nlp.process.DocumentPreprocessor;
|
10 |
+
import edu.stanford.nlp.process.PTBTokenizer;
|
11 |
+
import edu.stanford.nlp.ling.CoreLabel;
|
12 |
+
import edu.stanford.nlp.ling.HasWord;
|
13 |
+
import edu.stanford.nlp.ling.Sentence;
|
14 |
+
import edu.stanford.nlp.trees.*;
|
15 |
+
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
|
16 |
+
|
17 |
+
class ParserDemo {
|
18 |
+
|
19 |
+
/**
|
20 |
+
* The main method demonstrates the easiest way to load a parser.
|
21 |
+
* Simply call loadModel and specify the path of a serialized grammar
|
22 |
+
* model, which can be a file, a resource on the classpath, or even a URL.
|
23 |
+
* For example, this demonstrates loading from the models jar file, which
|
24 |
+
* you therefore need to include in the classpath for ParserDemo to work.
|
25 |
+
*/
|
26 |
+
public static void main(String[] args) {
|
27 |
+
LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
|
28 |
+
if (args.length > 0) {
|
29 |
+
demoDP(lp, args[0]);
|
30 |
+
} else {
|
31 |
+
demoAPI(lp);
|
32 |
+
}
|
33 |
+
}
|
34 |
+
|
35 |
+
/**
|
36 |
+
* demoDP demonstrates turning a file into tokens and then parse
|
37 |
+
* trees. Note that the trees are printed by calling pennPrint on
|
38 |
+
* the Tree object. It is also possible to pass a PrintWriter to
|
39 |
+
* pennPrint if you want to capture the output.
|
40 |
+
*/
|
41 |
+
public static void demoDP(LexicalizedParser lp, String filename) {
|
42 |
+
// This option shows loading, sentence-segmenting and tokenizing
|
43 |
+
// a file using DocumentPreprocessor.
|
44 |
+
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
|
45 |
+
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
|
46 |
+
// You could also create a tokenizer here (as below) and pass it
|
47 |
+
// to DocumentPreprocessor
|
48 |
+
for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
|
49 |
+
Tree parse = lp.apply(sentence);
|
50 |
+
parse.pennPrint();
|
51 |
+
System.out.println();
|
52 |
+
|
53 |
+
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
|
54 |
+
Collection tdl = gs.typedDependenciesCCprocessed();
|
55 |
+
System.out.println(tdl);
|
56 |
+
System.out.println();
|
57 |
+
}
|
58 |
+
}
|
59 |
+
|
60 |
+
/**
|
61 |
+
* demoAPI demonstrates other ways of calling the parser with
|
62 |
+
* already tokenized text, or in some cases, raw text that needs to
|
63 |
+
* be tokenized as a single sentence. Output is handled with a
|
64 |
+
* TreePrint object. Note that the options used when creating the
|
65 |
+
* TreePrint can determine what results to print out. Once again,
|
66 |
+
* one can capture the output by passing a PrintWriter to
|
67 |
+
* TreePrint.printTree.
|
68 |
+
*/
|
69 |
+
public static void demoAPI(LexicalizedParser lp) {
|
70 |
+
// This option shows parsing a list of correctly tokenized words
|
71 |
+
String[] sent = { "This", "is", "an", "easy", "sentence", "." };
|
72 |
+
List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
|
73 |
+
Tree parse = lp.apply(rawWords);
|
74 |
+
parse.pennPrint();
|
75 |
+
System.out.println();
|
76 |
+
|
77 |
+
// This option shows loading and using an explicit tokenizer
|
78 |
+
String sent2 = "This is another sentence.";
|
79 |
+
TokenizerFactory<CoreLabel> tokenizerFactory =
|
80 |
+
PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
|
81 |
+
Tokenizer<CoreLabel> tok =
|
82 |
+
tokenizerFactory.getTokenizer(new StringReader(sent2));
|
83 |
+
List<CoreLabel> rawWords2 = tok.tokenize();
|
84 |
+
parse = lp.apply(rawWords2);
|
85 |
+
|
86 |
+
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
|
87 |
+
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
|
88 |
+
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
|
89 |
+
List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
|
90 |
+
System.out.println(tdl);
|
91 |
+
System.out.println();
|
92 |
+
|
93 |
+
// You can also use a TreePrint object to print trees and dependencies
|
94 |
+
TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
|
95 |
+
tp.printTree(parse);
|
96 |
+
}
|
97 |
+
|
98 |
+
private ParserDemo() {} // static methods only
|
99 |
+
|
100 |
+
}
|
lng/L2SCA/stanford-parser-full-2014-01-04/ParserDemo2.java
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import java.io.IOException;
|
3 |
+
import java.io.StringReader;
|
4 |
+
import java.util.*;
|
5 |
+
|
6 |
+
import edu.stanford.nlp.ling.CoreLabel;
|
7 |
+
import edu.stanford.nlp.ling.HasWord;
|
8 |
+
import edu.stanford.nlp.ling.Label;
|
9 |
+
import edu.stanford.nlp.ling.Word;
|
10 |
+
import edu.stanford.nlp.process.DocumentPreprocessor;
|
11 |
+
import edu.stanford.nlp.process.Tokenizer;
|
12 |
+
import edu.stanford.nlp.trees.*;
|
13 |
+
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
|
14 |
+
|
15 |
+
class ParserDemo2 {
|
16 |
+
|
17 |
+
/** This example shows a few more ways of providing input to a parser.
|
18 |
+
*
|
19 |
+
* Usage: ParserDemo2 [grammar [textFile]]
|
20 |
+
*/
|
21 |
+
public static void main(String[] args) throws IOException {
|
22 |
+
String grammar = args.length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
|
23 |
+
String[] options = { "-maxLength", "80", "-retainTmpSubcategories" };
|
24 |
+
LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
|
25 |
+
TreebankLanguagePack tlp = lp.getOp().langpack();
|
26 |
+
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
|
27 |
+
|
28 |
+
Iterable<List<? extends HasWord>> sentences;
|
29 |
+
if (args.length > 1) {
|
30 |
+
DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
|
31 |
+
List<List<? extends HasWord>> tmp =
|
32 |
+
new ArrayList<List<? extends HasWord>>();
|
33 |
+
for (List<HasWord> sentence : dp) {
|
34 |
+
tmp.add(sentence);
|
35 |
+
}
|
36 |
+
sentences = tmp;
|
37 |
+
} else {
|
38 |
+
// Showing tokenization and parsing in code a couple of different ways.
|
39 |
+
String[] sent = { "This", "is", "an", "easy", "sentence", "." };
|
40 |
+
List<HasWord> sentence = new ArrayList<HasWord>();
|
41 |
+
for (String word : sent) {
|
42 |
+
sentence.add(new Word(word));
|
43 |
+
}
|
44 |
+
String sent2 = ("This is a slightly longer and more complex " +
|
45 |
+
"sentence requiring tokenization.");
|
46 |
+
// Use the default tokenizer for this TreebankLanguagePack
|
47 |
+
Tokenizer<? extends HasWord> toke =
|
48 |
+
tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
|
49 |
+
List<? extends HasWord> sentence2 = toke.tokenize();
|
50 |
+
List<List<? extends HasWord>> tmp =
|
51 |
+
new ArrayList<List<? extends HasWord>>();
|
52 |
+
tmp.add(sentence);
|
53 |
+
tmp.add(sentence2);
|
54 |
+
sentences = tmp;
|
55 |
+
}
|
56 |
+
|
57 |
+
for (List<? extends HasWord> sentence : sentences) {
|
58 |
+
Tree parse = lp.parse(sentence);
|
59 |
+
parse.pennPrint();
|
60 |
+
System.out.println();
|
61 |
+
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
|
62 |
+
List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
|
63 |
+
System.out.println(tdl);
|
64 |
+
System.out.println();
|
65 |
+
|
66 |
+
System.out.println("The words of the sentence:");
|
67 |
+
for (Label lab : parse.yield()) {
|
68 |
+
if (lab instanceof CoreLabel) {
|
69 |
+
System.out.println(((CoreLabel) lab).toString("{map}"));
|
70 |
+
} else {
|
71 |
+
System.out.println(lab);
|
72 |
+
}
|
73 |
+
}
|
74 |
+
System.out.println();
|
75 |
+
System.out.println(parse.taggedYield());
|
76 |
+
System.out.println();
|
77 |
+
|
78 |
+
}
|
79 |
+
|
80 |
+
// This method turns the String into a single sentence using the
|
81 |
+
// default tokenizer for the TreebankLanguagePack.
|
82 |
+
String sent3 = "This is one last test!";
|
83 |
+
lp.parse(sent3).pennPrint();
|
84 |
+
}
|
85 |
+
|
86 |
+
private ParserDemo2() {} // static methods only
|
87 |
+
|
88 |
+
}
|
lng/L2SCA/stanford-parser-full-2014-01-04/README.txt
ADDED
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Stanford Lexicalized Parser v3.3.1 - 2014-01-04
|
2 |
+
-----------------------------------------------
|
3 |
+
|
4 |
+
Copyright (c) 2002-2012 The Board of Trustees of The Leland Stanford Junior
|
5 |
+
University. All Rights Reserved.
|
6 |
+
|
7 |
+
Original core parser code by Dan Klein. Support code, additional
|
8 |
+
modules, languages, features, internationalization, compaction, typed
|
9 |
+
dependencies, etc. by Christopher Manning, Roger Levy, Teg Grenager,
|
10 |
+
Galen Andrew, Marie-Catherine de Marneffe, Jenny Finkel, Spence Green,
|
11 |
+
Bill MacCartney, Anna Rafferty, Huihsin Tseng, Pi-Chuan Chang,
|
12 |
+
Wolfgang Maier, Richard Eckart, Richard Socher, and John Bauer.
|
13 |
+
|
14 |
+
This release prepared by John Bauer.
|
15 |
+
|
16 |
+
This package contains 4 parsers: a high-accuracy unlexicalized PCFG; a
|
17 |
+
lexicalized dependency parser; a factored model, where the estimates
|
18 |
+
of dependencies and an unlexicalized PCFG are jointly optimized to
|
19 |
+
give a lexicalized PCFG treebank parser; and an RNN parser, where
|
20 |
+
recursive neural networks trained with semantic word vectors are used
|
21 |
+
to score parse trees. Also included are grammars for various
|
22 |
+
languages for use with these parsers.
|
23 |
+
|
24 |
+
For more information about the parser API, point a web browser at the
|
25 |
+
included javadoc directory (use the browser's Open File command to open
|
26 |
+
the index.html file inside the javadoc folder). Start by looking at the
|
27 |
+
Package page for the edu.stanford.nlp.parser.lexparser package, and then
|
28 |
+
look at the page for the LexicalizedParser class documentation therein,
|
29 |
+
particularly documentation of the main method.
|
30 |
+
|
31 |
+
Secondly, you should also look at the Parser FAQ on the web:
|
32 |
+
|
33 |
+
http://nlp.stanford.edu/software/parser-faq.shtml
|
34 |
+
|
35 |
+
This software requires Java 6 (JDK 1.6.0+). (You must have installed it
|
36 |
+
separately. Check that the command "java -version" works and gives 1.6+.)
|
37 |
+
|
38 |
+
|
39 |
+
QUICKSTART
|
40 |
+
|
41 |
+
UNIX COMMAND-LINE USAGE
|
42 |
+
|
43 |
+
On a Unix system you should be able to parse the English test file with the
|
44 |
+
following command:
|
45 |
+
|
46 |
+
./lexparser.sh data/testsent.txt
|
47 |
+
|
48 |
+
This uses the PCFG parser, which is quick to load and run, and quite accurate.
|
49 |
+
|
50 |
+
[Notes: it takes a few seconds to load the parser data before parsing
|
51 |
+
begins; continued parsing is quicker. To use the lexicalized parser, replace
|
52 |
+
englishPCFG.ser.gz with englishFactored.ser.gz in the lexparser.sh script
|
53 |
+
and use the flag -mx600m to give more memory to java.]
|
54 |
+
|
55 |
+
WINDOWS GUI USAGE
|
56 |
+
|
57 |
+
On a Windows system, assuming that java is on your PATH, you should be able
|
58 |
+
to run a parsing GUI by double-clicking on the lexparser-gui.bat icon,
|
59 |
+
or giving the command lexparser-gui in this directory from a command prompt.
|
60 |
+
|
61 |
+
Click Load File, Browse, and navigate to and select testsent.txt in
|
62 |
+
the top directory of the parser distribution. Click Load Parser,
|
63 |
+
Browse, and select the models jar, also in the top directory of the
|
64 |
+
parser distribution. From the models jar, select englishPCFG.ser.gz.
|
65 |
+
Click Parse to parse the first sentence.
|
66 |
+
|
67 |
+
OTHER USE CASES
|
68 |
+
|
69 |
+
The GUI is also available under Unix:
|
70 |
+
|
71 |
+
lexparser-gui.sh
|
72 |
+
|
73 |
+
Under Mac OS X, you can double-click on lexparser-gui.command to invoke the
|
74 |
+
GUI. The command-line version works on all platforms. Use lexparser.bat
|
75 |
+
to run it under Windows. The GUI is only for exploring the parser. It does
|
76 |
+
not allow you to save output. You need to use the command-line program or
|
77 |
+
programmatic API to do serious work with the parser.
|
78 |
+
|
79 |
+
ADDITIONAL GRAMMARS
|
80 |
+
|
81 |
+
The parser is supplied with several trained grammars. There are English
|
82 |
+
grammars based on the standard LDC Penn Treebank WSJ training sections 2-21
|
83 |
+
(wsj*), and ones based on an augmented data set, better for questions,
|
84 |
+
commands, and recent English and biomedical text (english*).
|
85 |
+
|
86 |
+
All grammars are located in the included models jar. (If you'd like to have
|
87 |
+
grammar files like in older versions of the parser, you can get them by
|
88 |
+
extracting them from the jar file with the 'jar -xf' command.)
|
89 |
+
|
90 |
+
MULTILINGUAL PARSING
|
91 |
+
In addition to the English grammars, the parser comes with trained grammars
|
92 |
+
for Arabic, Chinese, French, and German. To parse with these grammars, run
|
93 |
+
|
94 |
+
lexparser-lang.sh
|
95 |
+
|
96 |
+
with no arguments to see usage instructions. You can change language-specific
|
97 |
+
settings passed to the parser by modifying lexparser_lang.def.
|
98 |
+
|
99 |
+
You can also train and evaluate new grammars using:
|
100 |
+
|
101 |
+
lexparser-lang-train-test.sh
|
102 |
+
|
103 |
+
To see how we trained the grammars supplied in this distribution, see
|
104 |
+
|
105 |
+
bin/makeSerialized.csh
|
106 |
+
|
107 |
+
You will not be able to run this script (since it uses Stanford-specific file
|
108 |
+
paths), but you should be able to see what we did.
|
109 |
+
|
110 |
+
Arabic
|
111 |
+
Trained on parts 1-3 of the Penn Arabic Treebank (ATB) using the
|
112 |
+
pre-processing described in (Green and Manning, 2010). The default input
|
113 |
+
encoding is UTF-8 Arabic script. You can convert text in Buckwalter encoding to UTF-8
|
114 |
+
with the package edu.stanford.nlp.international.arabic.Buckwalter which is included
|
115 |
+
in stanford-parser.jar.
|
116 |
+
|
117 |
+
The parser *requires* segmentation and tokenization of raw text per the ATB standard
|
118 |
+
prior to parsing. You can generate this segmentation and tokenization with the Stanford
|
119 |
+
Word Segmenter, which is available separately at:
|
120 |
+
|
121 |
+
http://nlp.stanford.edu/software/segmenter.shtml
|
122 |
+
|
123 |
+
Chinese
|
124 |
+
There are Chinese grammars trained just on mainland material from
|
125 |
+
Xinhua and more mixed material from the LDC Chinese Treebank. The default
|
126 |
+
input encoding is GB18030.
|
127 |
+
|
128 |
+
French
|
129 |
+
Trained on the functionally annotated section of the French Treebank
|
130 |
+
(FTB) using the pre-processing described in (Green et al., 2011). For raw text input,
|
131 |
+
a tokenizer is enabled by default that produces FTB tokenization. To disable this
|
132 |
+
tokenizer, use the "-tokenized" option. To tokenize raw text separately, see
|
133 |
+
the usage information in edu.stanford.nlp.international.french.process.FrenchTokenizer.
|
134 |
+
|
135 |
+
German
|
136 |
+
Trained on the Negra corpus. Details are included in (Rafferty and
|
137 |
+
Manning, 2008).
|
138 |
+
|
139 |
+
TREEBANK PREPROCESSING
|
140 |
+
|
141 |
+
The pre-processed versions of the ATB described
|
142 |
+
in (Green and Manning, 2010) and the FTB described in (Green et al.,
|
143 |
+
2011) can be reproduced using the TreebankPreprocessor included in this
|
144 |
+
release. The configuration files are located in /conf. For example,
|
145 |
+
to create the ATB data, run:
|
146 |
+
|
147 |
+
bin/run-tb-preproc -v conf/atb-latest.conf
|
148 |
+
|
149 |
+
Note that you'll need to update the conf file paths to your local treebank
|
150 |
+
distributions as the data is not distributed with the parser. You'll
|
151 |
+
also need to set the classpath in the cmd_line variable of run-tb-preproc.
|
152 |
+
|
153 |
+
The TreebankPreprocessor conf files support various options, which are
|
154 |
+
documented in
|
155 |
+
|
156 |
+
edu.stanford.nlp.international.process.ConfigParser
|
157 |
+
|
158 |
+
EVALUATION METRICS
|
159 |
+
|
160 |
+
The Stanford parser comes with Java implementations of the following
|
161 |
+
evaluation metrics:
|
162 |
+
|
163 |
+
Dependency Labeled Attachment
|
164 |
+
|
165 |
+
Evalb (Collins, 1997)
|
166 |
+
-Includes per-category evaluation with the -c option
|
167 |
+
|
168 |
+
Leaf Ancestor (Sampson and Babarczy, 2003)
|
169 |
+
-Both micro- and macro-averaged score
|
170 |
+
|
171 |
+
Tagging Accuracy
|
172 |
+
|
173 |
+
See the usage instructions and javadocs in the requisite classes located in
|
174 |
+
edu.stanford.nlp.parser.metrics.
|
175 |
+
|
176 |
+
LICENSE
|
177 |
+
|
178 |
+
// StanfordLexicalizedParser -- a probabilistic lexicalized NL CFG parser
|
179 |
+
// Copyright (c) 2002-2012 The Board of Trustees of
|
180 |
+
// The Leland Stanford Junior University. All Rights Reserved.
|
181 |
+
//
|
182 |
+
// This program is free software; you can redistribute it and/or
|
183 |
+
// modify it under the terms of the GNU General Public License
|
184 |
+
// as published by the Free Software Foundation; either version 2
|
185 |
+
// of the License, or (at your option) any later version.
|
186 |
+
//
|
187 |
+
// This program is distributed in the hope that it will be useful,
|
188 |
+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
189 |
+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
190 |
+
// GNU General Public License for more details.
|
191 |
+
//
|
192 |
+
// You should have received a copy of the GNU General Public License
|
193 |
+
// along with this program; if not, write to the Free Software
|
194 |
+
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
195 |
+
//
|
196 |
+
// For more information, bug reports, fixes, contact:
|
197 |
+
// Christopher Manning
|
198 |
+
// Dept of Computer Science, Gates 1A
|
199 |
+
// Stanford CA 94305-9010
|
200 |
+
// USA
|
201 |
+
// [email protected]
|
202 |
+
// http://nlp.stanford.edu/downloads/lex-parser.shtml
|
203 |
+
|
204 |
+
|
205 |
+
---------------------------------
|
206 |
+
CHANGES
|
207 |
+
---------------------------------
|
208 |
+
|
209 |
+
2014-01-04 3.3.1 Bugfix release, dependency improvements
|
210 |
+
|
211 |
+
2013-11-12 3.3.0 Remove the attr dependency, add imperatives to
|
212 |
+
English training data
|
213 |
+
|
214 |
+
2013-06-19 3.2.0 New RNN model for WSJ and English with
|
215 |
+
improved test set accuracy, rel dependency
|
216 |
+
removed
|
217 |
+
|
218 |
+
2013-04-05 2.0.5 Dependency improvements, ctb7 model, -nthreads
|
219 |
+
option
|
220 |
+
|
221 |
+
2012-11-12 2.0.4 Dependency speed improvements; other
|
222 |
+
dependency changes
|
223 |
+
|
224 |
+
2012-07-09 2.0.3 Minor bug fixes
|
225 |
+
|
226 |
+
2012-05-22 2.0.2 Supports adding extra data in non-tree format
|
227 |
+
|
228 |
+
2012-03-09 2.0.1 Caseless English model added, ready for maven
|
229 |
+
|
230 |
+
2012-01-11 2.0.0 Threadsafe!
|
231 |
+
|
232 |
+
2011-09-14 1.6.9 Added some imperatives to the English
|
233 |
+
training data; added root dependency.
|
234 |
+
|
235 |
+
2011-06-15 1.6.8 Added French parser and leaf ancestor
|
236 |
+
evaluation metric; reorganized distribution;
|
237 |
+
new data preparation scripts; rebuilt grammar
|
238 |
+
models; other bug fixes
|
239 |
+
|
240 |
+
2011-05-15 1.6.7 Minor bug fixes
|
241 |
+
|
242 |
+
2011-04-17 1.6.6 Compatible with tagger, corenlp and tregex.
|
243 |
+
|
244 |
+
2010-10-30 1.6.5 Further improvements to English Stanford
|
245 |
+
Dependencies and other minor changes
|
246 |
+
|
247 |
+
2010-08-16 1.6.4 More minor bug fixes and improvements to English
|
248 |
+
Stanford Dependencies and question parsing
|
249 |
+
|
250 |
+
2010-07-09 1.6.3 Improvements to English Stanford Dependencies and
|
251 |
+
question parsing, minor bug fixes
|
252 |
+
|
253 |
+
2010-02-25 1.6.2 Improvements to Arabic parser models,
|
254 |
+
and to English and Chinese Stanford Dependencies
|
255 |
+
|
256 |
+
2008-10-19 1.6.1 Slightly improved Arabic, German and
|
257 |
+
Stanford Dependencies
|
258 |
+
|
259 |
+
2007-08-18 1.6 Added Arabic, k-best PCCFG parsing;
|
260 |
+
improved English grammatical relations
|
261 |
+
|
262 |
+
2006-05-30 1.5.1 Improved English and Chinese grammatical relations;
|
263 |
+
fixed UTF-8 handling
|
264 |
+
|
265 |
+
2005-07-20 1.5 Added grammatical relations output;
|
266 |
+
fixed bugs introduced in 1.4
|
267 |
+
|
268 |
+
2004-03-24 1.4 Made PCFG faster again (by FSA minimization);
|
269 |
+
added German support
|
270 |
+
|
271 |
+
2003-09-06 1.3 Made parser over twice as fast;
|
272 |
+
added tokenization options
|
273 |
+
|
274 |
+
2003-07-20 1.2 Halved PCFG memory usage;
|
275 |
+
added support for Chinese
|
276 |
+
|
277 |
+
2003-03-25 1.1 Improved parsing speed; included GUI,
|
278 |
+
improved PCFG grammar
|
279 |
+
|
280 |
+
2002-12-05 1.0 Initial release
|
lng/L2SCA/stanford-parser-full-2014-01-04/README_dependencies.txt
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
STANFORD DEPENDENCIES. Stanford Parser v3.3.1
|
2 |
+
-----------------------------------------------------------
|
3 |
+
|
4 |
+
The manual for the English version of the Stanford Dependencies
|
5 |
+
representation:
|
6 |
+
|
7 |
+
StanfordDependenciesManual.pdf
|
8 |
+
|
9 |
+
should be consulted for the current set of dependency representations
|
10 |
+
and the correct commands for generating Stanford Dependencies together
|
11 |
+
with any of the Stanford Parser, another parser, or a treebank.
|
12 |
+
|
13 |
+
A typed dependencies representation is also available for Chinese. For
|
14 |
+
the moment the documentation consists of the code, and a brief
|
15 |
+
presentation in this paper:
|
16 |
+
|
17 |
+
Pi-Chuan Chang, Huihsin Tseng, Dan Jurafsky, and Christopher
|
18 |
+
D. Manning. 2009. Discriminative Reordering with Chinese Grammatical
|
19 |
+
Relations Features. Third Workshop on Syntax and Structure in Statistical
|
20 |
+
Translation.
|
21 |
+
|
22 |
+
|
23 |
+
--------------------------------------
|
24 |
+
ORIGINAL DEPENDENCIES SCHEME
|
25 |
+
|
26 |
+
For an overview of the original typed dependencies scheme, please look
|
27 |
+
at:
|
28 |
+
|
29 |
+
Marie-Catherine de Marneffe, Bill MacCartney, and Christopher D.
|
30 |
+
Manning. 2006. Generating Typed Dependency Parses from Phrase
|
31 |
+
Structure Parses. 5th International Conference on Language Resources
|
32 |
+
and Evaluation (LREC 2006).
|
33 |
+
http://nlp.stanford.edu/~manning/papers/LREC_2.pdf
|
34 |
+
|
35 |
+
For more discussion of the design principles, please see:
|
36 |
+
|
37 |
+
Marie-Catherine de Marneffe and Christopher D. Manning. 2008. The
|
38 |
+
Stanford typed dependencies representation. In Proceedings of the
|
39 |
+
workshop on Cross-Framework and Cross-Domain Parser Evaluation, pp. 1-8.
|
40 |
+
http://nlp.stanford.edu/~manning/papers/dependencies-coling08.pdf
|
41 |
+
|
42 |
+
These papers can be cited as references for the English Stanford
|
43 |
+
Dependencies.
|
44 |
+
|
45 |
+
|
46 |
+
--------------------------------------
|
47 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v3.3.1
|
48 |
+
|
49 |
+
A couple of fixes/improvements were made in the dependency conversion,
|
50 |
+
and one change was made to the taxonomy of relations.
|
51 |
+
|
52 |
+
- The partmod and infmod relations were deleted, and replaced with
|
53 |
+
vmod for reduced, non-finite verbal modifiers. The distinction between
|
54 |
+
these two relations can be recovered from the POS tag of the dependent.
|
55 |
+
- A couple of improvements were made to the conversion, the largest
|
56 |
+
one being recognizing pobj inside a PP not headed by something tagged
|
57 |
+
as IN or TO.
|
58 |
+
|
59 |
+
|
60 |
+
--------------------------------------
|
61 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v3.3
|
62 |
+
|
63 |
+
Some fixes/improvements were made in the dependency conversion, and one
|
64 |
+
change was made to the taxonomy of relations.
|
65 |
+
|
66 |
+
- For currency amount expressions with a currency symbol like "$", it
|
67 |
+
had previously been the case that "$" was the head, and then each
|
68 |
+
number word modified it as a number. We realized that this was
|
69 |
+
unnecessarily inconsistent. For the expression "two thousand dollars",
|
70 |
+
"dollars" is the head, but "thousand" is a num modifier of it, and
|
71 |
+
number is used for the parts of a number multi-word expression only.
|
72 |
+
This analysis is now also used for cases with a currency symbol. E.g.,
|
73 |
+
"for $ 52.7 million": prep(for, $) num($, million) number(million, 52.7).
|
74 |
+
Similarly, for "the $ 2.29 billion value", we changed the analysis from
|
75 |
+
num(value, $) number($, billion) to amod(value, $) num($, billion).
|
76 |
+
This corresponds to hwat you got for "a two dollar value".
|
77 |
+
This is actually the most common change (at least on WSJ newswire!).
|
78 |
+
- Remove the attr relation. Some cases disappear by making the question
|
79 |
+
phrase of WHNP be NP questions the root. Others (predicative NP
|
80 |
+
complements) become xcomp.
|
81 |
+
- Less aggressive labeling of participial form VPs as xcomp. More of them
|
82 |
+
are correctly labeled partmod (but occasionally a true xcomp is also
|
83 |
+
mislabeled as partmod).
|
84 |
+
- Small rule changes to recognize a few more ccomp and parataxis.
|
85 |
+
|
86 |
+
|
87 |
+
--------------------------------------
|
88 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v3.2, JUNE 2013
|
89 |
+
|
90 |
+
Various small fixes were made to the dependencies conversion,
|
91 |
+
and one change to the taxonomy of relations:
|
92 |
+
- rel was removed. rel was originally used as the relation for an
|
93 |
+
overt relativizer in a relative clause. But it was never a real
|
94 |
+
grammatical relation, and we gradually started labeling easy cases
|
95 |
+
as nsubj or dobj. In this release, rel is removed, pobj cases are
|
96 |
+
also labeled, and the remaining hard cases are labeled as dep.
|
97 |
+
|
98 |
+
--------------------------------------
|
99 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v2.0.5, MARCH 2013
|
100 |
+
|
101 |
+
We have begun a more major effort to improve the suitability and coverage of
|
102 |
+
Stanford Dependencies on less formal text types, and to clean up a couple of
|
103 |
+
the more quirky dependencies in the original set. These changes are still
|
104 |
+
ongoing, but in this first installment, we have removed 3 dependencies and
|
105 |
+
added 2:
|
106 |
+
- abbrev was removed, and is now viewed as just a case of appos.
|
107 |
+
- complm was removed, and is now viewed as just a case of mark.
|
108 |
+
(This is consistent with an HPSG-like usage of mark.)
|
109 |
+
- purpcl was removed, and is now viewed as just a case of advcl.
|
110 |
+
- discourse was added. The lack of a dependency type for
|
111 |
+
interjections was an omission even in the early versions, but it
|
112 |
+
became essential as we expanded our consideration of informal
|
113 |
+
text types. It is used for interjections, fillers, discourse markers
|
114 |
+
and emoticons.
|
115 |
+
- goeswith was added. In badly edited text, it is used to join the
|
116 |
+
two parts of a word.
|
117 |
+
|
118 |
+
A few other changes and improvements were also made, including improvements
|
119 |
+
in the recognition of advcl. There has been a reduction of "dep" dependencies
|
120 |
+
of about 14% on newswire (and higher on more informal text genres).
|
121 |
+
|
122 |
+
|
123 |
+
--------------------------------------
|
124 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v2.0.4, NOVEMBER 2012
|
125 |
+
|
126 |
+
A few minor changes and fixes were made: HYPH is now recognized, and treated
|
127 |
+
as punctuation and clausal complements of adjectives (including comparatives)
|
128 |
+
are recognized as ccomp.
|
129 |
+
|
130 |
+
--------------------------------------
|
131 |
+
|
132 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v1.6.9
|
133 |
+
|
134 |
+
This version adds an explicit root dependency in the set of dependencies
|
135 |
+
returned. In the past, there had been no explicit representation of the
|
136 |
+
root of the sentence in the set of dependencies returned, except in the
|
137 |
+
CoNLL format output, which always showed the root. Now, there is always
|
138 |
+
an explicit extra dependency that marks the sentence root, using a fake
|
139 |
+
ROOT pseudoword with index 0. That is, the root is marked in this way:
|
140 |
+
root(ROOT-0, depends-3)
|
141 |
+
Otherwise there were only a couple of minute changes in the dependencies
|
142 |
+
produced (appositions are now recognized in WHNPs!).
|
143 |
+
|
144 |
+
--------------------------------------
|
145 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- v1.6.8
|
146 |
+
|
147 |
+
This version includes only small fixes, principally addressing some gaps
|
148 |
+
in the correct treatment of dependencies in inverted sentence (SQ and SINV)
|
149 |
+
constructions, and some errors in the treatment of copulas in the presence of
|
150 |
+
temporal NPs.
|
151 |
+
|
152 |
+
|
153 |
+
--------------------------------------
|
154 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- NOVEMBER 2010 - JANUARY 2011
|
155 |
+
|
156 |
+
Two changes were made to the taxonomy of dependencies.
|
157 |
+
- measure (phrase modifier) was generalized and replaced by
|
158 |
+
npadvmod (noun phrase adverbial modifier) which includes measure
|
159 |
+
phrases and other adverbial uses of noun phrases. Temporal NPs
|
160 |
+
(tmod) are now a subtype of npadvmod in the dependency hierarchy.
|
161 |
+
- mwe (multi-word expression) is introduced for certain common
|
162 |
+
function word dependencies for which another good analysis isn't
|
163 |
+
easy to come by (and which were frequently dep before) such as
|
164 |
+
"instead of" or "rather than".
|
165 |
+
|
166 |
+
A new option has ben added to allow the copula to be treated as
|
167 |
+
the head when it has an adjective or noun complement.
|
168 |
+
|
169 |
+
The conversion software will now work fairly well with the
|
170 |
+
David Vadas version of the treebank with extra noun phrase
|
171 |
+
structure. (A few rare cases that are handled with the standard
|
172 |
+
treebank aren't yet handled, but you will get better dependencies
|
173 |
+
for compound nouns and multiword adjectival modifiers, etc.)
|
174 |
+
|
175 |
+
Considerable improvements were made in the coverage of named
|
176 |
+
dependencies. You should expect to see only about half as many generic
|
177 |
+
"dep" dependencies as in version 1.6.4.
|
178 |
+
|
179 |
+
--------------------------------------
|
180 |
+
CHANGES IN ENGLISH TYPED DEPENDENCIES CODE -- JUNE-AUGUST 2010
|
181 |
+
|
182 |
+
No new dependency relations have been introduced.
|
183 |
+
|
184 |
+
There have been some significant improvements in the generated
|
185 |
+
dependencies, principally covering:
|
186 |
+
- Better resolution of nsubj and dobj long distance dependencies
|
187 |
+
(but v1.6.4 fixes the overpercolation of dobj in v1.6.3)
|
188 |
+
- Better handling of conjunction distribution in CCprocessed option
|
189 |
+
- Correction of bug in v1.6.2 that made certain verb dependents noun
|
190 |
+
dependents.
|
191 |
+
- Better dependencies are generated for question structures (v1.6.4)
|
192 |
+
- Other minor improvements in recognizing passives, adverbial
|
193 |
+
modifiers, etc.
|
194 |
+
|
lng/L2SCA/stanford-parser-full-2014-01-04/StanfordDependenciesManual.pdf
ADDED
Binary file (307 kB). View file
|
|
lng/L2SCA/stanford-parser-full-2014-01-04/bin/makeSerialized.csh
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/csh -f
|
2 |
+
|
3 |
+
# This is the file we use to make the serialized grammars for the parser.
|
4 |
+
# If you are on the Stanford NLP machines, you can use it to remake the
|
5 |
+
# serialized parsers (such as when there have been incompatible software
|
6 |
+
# changes). Don't forget to klog first so you can access the AFS corpora.
|
7 |
+
#
|
8 |
+
# If you are not on the Stanford NLP machines, then the script won't work
|
9 |
+
# for you as is, since it contains hard-coded paths to various treebanks.
|
10 |
+
# But it may still be useful to inspect it to see what options we used to
|
11 |
+
# generate the various supplied grammars.
|
12 |
+
#
|
13 |
+
# NOTE: Output files in this script should ALWAYS use relative paths, so
|
14 |
+
# that you can copy this script and run it in a different directory and
|
15 |
+
# it will write output files there.
|
16 |
+
#
|
17 |
+
# usage:
|
18 |
+
# cd /u/nlp/data/lexparser # to have files output in "usual" location
|
19 |
+
# ./makeSerialized.csh
|
20 |
+
#
|
21 |
+
## Uncomment this bit to run it with older parser version
|
22 |
+
# setenv CLASSPATH /u/nlp/distrib/lexparser-2004-03-24/javanlp.jar:
|
23 |
+
|
24 |
+
if ( ! $?JAVANLP_HOME) then
|
25 |
+
echo 'JAVANLP_HOME is not set'
|
26 |
+
echo 'Add a line like setenv JAVANLP_HOME $HOME/javanlp to your environment'
|
27 |
+
exit
|
28 |
+
endif
|
29 |
+
|
30 |
+
set wsjptb=/afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj
|
31 |
+
# now ctb6
|
32 |
+
set ctb=/afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed
|
33 |
+
# now ctb7!
|
34 |
+
set ctb7train=/u/nlp/data/chinese/ctb7/train.mrg
|
35 |
+
set ctb7test=/u/nlp/data/chinese/ctb7/test.mrg
|
36 |
+
set negra=/afs/ir/data/linguistic-data/NEGRA/penn-format-train-dev-test
|
37 |
+
|
38 |
+
set host=`hostname | cut -d. -f1`
|
39 |
+
|
40 |
+
if ( ! -r $wsjptb) then
|
41 |
+
echo "Can't read WSJ PTB. Maybe you forgot to klog??"
|
42 |
+
exit
|
43 |
+
endif
|
44 |
+
|
45 |
+
mv -f serializedParsers.log serializedParsers.bak
|
46 |
+
uptime > serializedParsers.log
|
47 |
+
echo "Classpath is $CLASSPATH" >> serializedParsers.log
|
48 |
+
|
49 |
+
# English WSJ 2-21 PCFG binary and text grammars
|
50 |
+
|
51 |
+
( echo "Running wsjPCFG (goodPCFG) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -saveToSerializedFile wsjPCFG.ser.gz -saveToTextFile wsjPCFG.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
52 |
+
|
53 |
+
# English noTagSplit no rule compaction PCFG text grammar
|
54 |
+
( echo "Running wsjPCFG-noTagSplit-noCompact on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -noTagSplit -saveToTextFile wsjPCFG-noTagSplit.txt -compactGrammar 0 -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
55 |
+
|
56 |
+
# English WSJ 2-21 Factored binary
|
57 |
+
|
58 |
+
## Not yet clear that goodFactored is better than -ijcai03 -- not on dev set
|
59 |
+
# ( echo "Running wsjFactored (goodFactored) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDAtsv" -goodFactored -saveToSerializedFile wsjFactored.ser.gz -saveToTextFile wsjFactored.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
60 |
+
( echo "Running wsjFactored (ijcai03 correctTags) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -printStates -compactGrammar 0 -correctTags -saveToSerializedFile wsjFactored.ser.gz -saveToTextFile wsjFactored.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
61 |
+
( echo "Running wsjFactored (ijcai03 replication) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -printStates -compactGrammar 0 -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
62 |
+
|
63 |
+
|
64 |
+
## "General English" models
|
65 |
+
|
66 |
+
# english{Factored|PCFG} is currently trained on:
|
67 |
+
# - WSJ sections 1-21
|
68 |
+
# - Genia as reformatted by Andrew Clegg, his training split
|
69 |
+
# - 2 English Chinese Translation Treebank and 3 English Arabic Translation
|
70 |
+
# Treebank files backported to the original treebank annotation standards
|
71 |
+
# (by us)
|
72 |
+
# - 95 sentences parsed by us (mainly questions and imperatives; a few from
|
73 |
+
# recent newswire).
|
74 |
+
|
75 |
+
# /u/nlp/data/genia/sentences_cleaned.tree
|
76 |
+
|
77 |
+
# "General English" Factored binary
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
( echo "Running englishFactored (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -ijcai03 -saveToSerializedFile englishFactored.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
|
82 |
+
|
83 |
+
# "General English" PCFG binary
|
84 |
+
|
85 |
+
( echo "Running englishPCFG (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -saveToSerializedFile englishPCFG.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
|
86 |
+
|
87 |
+
|
88 |
+
# "General English" PCFG, case insensitive, binary
|
89 |
+
|
90 |
+
( echo "Running caseless englishPCFG (from treebank) on $host server" ; time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.LowercaseAndAmericanizeFunction -evals factDA,tsv -goodPCFG -saveToSerializedFile englishPCFG.caseless.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
|
91 |
+
|
92 |
+
|
93 |
+
# English WSJ 2-21 PCFG simplified grammar
|
94 |
+
# This dumbed down parser is used by the RNN parser.
|
95 |
+
# See /scr/nlp/data/dvparser for more details.
|
96 |
+
( echo "Running wsj pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile wsjPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log
|
97 |
+
|
98 |
+
# English with extras PCFG simplified grammar
|
99 |
+
# This dumbed down parser is used by the RNN parser.
|
100 |
+
# See /scr/nlp/data/dvparser for more details.
|
101 |
+
( echo "Running english pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile englishPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log
|
102 |
+
|
103 |
+
|
104 |
+
# Xinhua Mainland Chinese PCFG binary
|
105 |
+
|
106 |
+
( echo "Running xinhuaPCFG on $host -server" ; time java -server -mx800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -saveToSerializedFile xinhuaPCFG.ser.gz -maxLength 40 -train $ctb 026-270,301-499,600-999 -test $ctb 001-025 ) >>& ./serializedParsers.log
|
107 |
+
# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
|
108 |
+
# newer train list (Galen and Huihsin): 026-270,301-499,600-999
|
109 |
+
# this is all Xinhua minus Stanford devel and Bikel test
|
110 |
+
|
111 |
+
# Xinhua Mainland Chinese Factored binary
|
112 |
+
|
113 |
+
( echo "Running xinhuaFactored on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -acl03chinese -scTags -saveToSerializedFile xinhuaFactored.ser.gz -maxLength 40 -train $ctb 026-270,301-499,600-999 -test $ctb 001-025 ) >>& ./serializedParsers.log
|
114 |
+
|
115 |
+
# Mixed dialect Chinese on lots of data (with chineseFactored)
|
116 |
+
|
117 |
+
( echo "Running chineseFactored on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chineseFactored -saveToSerializedFile chineseFactored.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log
|
118 |
+
# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
|
119 |
+
# newer train list (Galen and Huihsin): 026-270,301-499,600-999
|
120 |
+
# this is all Xinhua minus Stanford devel and Bikel test
|
121 |
+
# CTB files 001-499, 555-589,597-1000 are from newswire of
|
122 |
+
# XinHua.
|
123 |
+
# Files 500-554 are Information Services Department of HKSAR.
|
124 |
+
# Files 590-596 and 1001-1151 are Sinorama articles, more of literature
|
125 |
+
# nature and from Taiwan.
|
126 |
+
# Files 2000-3145 are ACE broadcast news (from where?). We only use a few for now.
|
127 |
+
|
128 |
+
# Mixed dialect Chinese PCFG on lots of data
|
129 |
+
|
130 |
+
( echo "Running chinesePCFG on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -useUnicodeType -saveToSerializedFile chinesePCFG.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log
|
131 |
+
# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
|
132 |
+
# newer train list (Galen and Huihsin): 026-270,301-499,600-999
|
133 |
+
# this is all Xinhua minus Stanford devel and Bikel test
|
134 |
+
|
135 |
+
|
136 |
+
# Chinese parser for unsegmented Chinese
|
137 |
+
|
138 |
+
( echo "Running xinhuaFactoredSegmenting on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -segmentMarkov -train $ctb 26-270,301-499,600-999 -sctags -acl03chinese -saveToSerializedFile xinhuaFactoredSegmenting.ser.gz ) >>& ./serializedParsers.log
|
139 |
+
java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -encoding utf-8 xinhuaFactoredSegmenting.ser.gz /u/nlp/data/lexparser/chinese-onesent-unseg-utf8.txt >>& ./serializedParsers.log
|
140 |
+
|
141 |
+
|
142 |
+
# It used to be the case that explicitly saying tLPP on command line was
|
143 |
+
# needed for file encoding. But it has been fixed.
|
144 |
+
# ( echo "Running xinhuaFactored from serialized check on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -maxLength 40 -loadFromSerializedFile xinhuaFactored.ser.gz -test $ctb 001-025 ) >>& ./serializedParsers.log
|
145 |
+
# This now works
|
146 |
+
( echo "Running xinhuaFactored from serialized (check without specifying -tLPP) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -maxLength 40 -loadFromSerializedFile xinhuaFactored.ser.gz -test $ctb 001-025 ) >>& ./serializedParsers.log
|
147 |
+
|
148 |
+
( echo "Running chinesePCFG (simplified for use in the RNN parser) on $host -server" ; time java -server -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chineseFactored -PCFG -compactGrammar 0 -saveToSerializedFile chinesePCFG-simple.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log
|
149 |
+
|
150 |
+
# German Factored binary from Negra (version 2)
|
151 |
+
# $negra 3 is the dev set
|
152 |
+
|
153 |
+
( echo "Running germanFactored on $host -server" ; time java -server -mx5g edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile germanFactored.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log
|
154 |
+
|
155 |
+
# German PCFG from Negra (version 2)
|
156 |
+
|
157 |
+
( echo "Running germanPCFG on $host -server" ; time java -server -mx2g edu.stanford.nlp.parser.lexparser.LexicalizedParser -v -evals tsv -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -PCFG -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 1 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile germanPCFG.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log
|
158 |
+
|
159 |
+
# German Dependency parser
|
160 |
+
# This requires normalizing the dependency output to strip boundary symbol.
|
161 |
+
# ( echo "Running germanDep on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -dep -hMarkov 1 -maxLength 40 -saveToSerializedFile germanDep.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log
|
162 |
+
|
163 |
+
|
164 |
+
|
165 |
+
########
|
166 |
+
# The languages below this line use TreebankPreprocessor for pre-processing prior to training
|
167 |
+
########
|
168 |
+
set mydir=`pwd`
|
169 |
+
set data_dir=/u/nlp/data/lexparser/trees
|
170 |
+
set tree_pipe=$JAVANLP_HOME/projects/core/scripts/run-tb-preproc
|
171 |
+
set train_sh=$JAVANLP_HOME/projects/core/scripts/lexparser-lang-train-test.sh
|
172 |
+
|
173 |
+
if( ! -e $data_dir ) then
|
174 |
+
mkdir $data_dir
|
175 |
+
endif
|
176 |
+
|
177 |
+
########
|
178 |
+
# ARABIC
|
179 |
+
########
|
180 |
+
set ar_data_dir=$data_dir/Arabic
|
181 |
+
set ar_conf_file=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/arabic/pipeline/configurations/atb-latest.conf
|
182 |
+
set ar_train_args="Arabic 40 $ar_data_dir/2-Unvoc-All.utf8.txt $ar_data_dir/2-Unvoc-Dev.utf8.txt BASELINE_ar -saveToSerializedFile arabicFactored.ser.gz"
|
183 |
+
|
184 |
+
if( ! -e $ar_data_dir ) then
|
185 |
+
mkdir $ar_data_dir
|
186 |
+
endif
|
187 |
+
|
188 |
+
echo Running $tree_pipe -p $ar_data_dir -v $ar_conf_file >>& ./serializedParsers.log
|
189 |
+
$tree_pipe -p $ar_data_dir -v $ar_conf_file >& $ar_data_dir/build.log
|
190 |
+
|
191 |
+
echo "" >>& ./serializedParsers.log
|
192 |
+
( echo "Training Arabic Factored grammar using baseline feature set" ; time $train_sh $ar_train_args ) >>& ./serializedParsers.log
|
193 |
+
|
194 |
+
|
195 |
+
########
|
196 |
+
# FRENCH
|
197 |
+
########
|
198 |
+
set fr_data_dir=$data_dir/French
|
199 |
+
set fr_conf_file=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/configurations/ftb-latest.conf
|
200 |
+
set fr_train_args="French 40 $fr_data_dir/FTB-All.utf8.txt $fr_data_dir/FTB-Dev.utf8.txt BASELINE_fr -saveToSerializedFile frenchFactored.ser.gz"
|
201 |
+
|
202 |
+
if( ! -e $fr_data_dir ) then
|
203 |
+
mkdir $fr_data_dir
|
204 |
+
endif
|
205 |
+
|
206 |
+
echo Running $tree_pipe -p $fr_data_dir -v $fr_conf_file >>& ./serializedParsers.log
|
207 |
+
$tree_pipe -p $fr_data_dir -v $fr_conf_file >& $fr_data_dir/build.log
|
208 |
+
|
209 |
+
echo "" >>& ./serializedParsers.log
|
210 |
+
echo time $train_sh $fr_train_args >>& ./serializedParsers.log
|
211 |
+
( echo "Training French Factored grammar using baseline feature set" ; time $train_sh $fr_train_args ) >>& ./serializedParsers.log
|
212 |
+
|
213 |
+
|
214 |
+
|
215 |
+
|
216 |
+
## English just to check parser code regression (not saved)
|
217 |
+
|
218 |
+
## Just for reference
|
219 |
+
( echo "Running wsjPCFG (acl03pcfg replication) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -acl03pcfg -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
220 |
+
|
221 |
+
## See if same results from serialized parser
|
222 |
+
( echo "Running wsjFactored (ijcai03 from serialized) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -v -maxLength 40 -loadFromSerializedFile wsjFactored.ser.gz -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
223 |
+
# ( echo "Running wsjFactored (ijcai03 with nodeprune) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -compactGrammar 0 -nodePrune true -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
224 |
+
|
225 |
+
## See if same results from text grammar parser
|
226 |
+
( echo "Running wsjFactored (ijcai03 from textGrammar) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -v -maxLength 40 -loadFromTextFile wsjFactored.txt -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
|
227 |
+
|
228 |
+
uptime >> serializedParsers.log
|
229 |
+
|
230 |
+
mv -f serializedParsersPerformance.last serializedParsersPerformance.2ndlast
|
231 |
+
mv -f serializedParsersPerformance.current serializedParsersPerformance.last
|
232 |
+
echo -n "Parser run by $USER on " > serializedParsersPerformance.current
|
233 |
+
date >> serializedParsersPerformance.current
|
234 |
+
grep 'N: 253\|N: 393\|Done testing on treebank\|Running \| summary ' serializedParsers.log >> serializedParsersPerformance.current
|
235 |
+
echo >> serializedParsersPerformance.current
|
236 |
+
echo >> serializedParsersPerformance.current
|
237 |
+
|
238 |
+
cat serializedParsersPerformance.current >> serializedParsersPerformance.txt
|
239 |
+
|
240 |
+
cp -f serializedParsers.last serializedParsers.2ndlast
|
241 |
+
cp -f serializedParsers.current serializedParsers.last
|
242 |
+
cp -f serializedParsers.log serializedParsers.current
|
lng/L2SCA/stanford-parser-full-2014-01-04/bin/run-tb-preproc
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
#
|
3 |
+
# Convenience script for running
|
4 |
+
# edu.stanford.nlp.trees.treebank.TreebankPreprocessor.
|
5 |
+
#
|
6 |
+
# This package automatically generates the Arabic and French
|
7 |
+
# parser training data from the respective source distributions.
|
8 |
+
#
|
9 |
+
# See the README for more details.
|
10 |
+
#
|
11 |
+
# author: Spence Green
|
12 |
+
##############################
|
13 |
+
|
14 |
+
import sys
|
15 |
+
from optparse import OptionParser
|
16 |
+
import os
|
17 |
+
import subprocess
|
18 |
+
from time import sleep
|
19 |
+
|
20 |
+
def run_treebank_pipeline(opts,conf_file):
|
21 |
+
cmd_line = 'java -Xmx%s -Xms%s edu.stanford.nlp.trees.treebank.TreebankPreprocessor' % (opts.jmem,opts.jmem)
|
22 |
+
|
23 |
+
if opts.verbose:
|
24 |
+
cmd_line = cmd_line + ' -v'
|
25 |
+
|
26 |
+
if opts.extra:
|
27 |
+
cmd_line = cmd_line + ' ' + opts.extra
|
28 |
+
|
29 |
+
if opts.output_path:
|
30 |
+
cmd_line = cmd_line + ' -p ' + opts.output_path
|
31 |
+
|
32 |
+
cmd_line = cmd_line + ' ' + conf_file
|
33 |
+
|
34 |
+
p = call_command(cmd_line)
|
35 |
+
|
36 |
+
while p.poll() == None:
|
37 |
+
out_str = p.stdout.readline()
|
38 |
+
if out_str != '':
|
39 |
+
print out_str[:-1]
|
40 |
+
|
41 |
+
# TODO: this will not handle spaces in the input or output paths
|
42 |
+
def call_command(command):
|
43 |
+
process = subprocess.Popen(command.split(' '), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
44 |
+
return process
|
45 |
+
|
46 |
+
def main():
|
47 |
+
usage = 'usage: %prog [opts] conf_file'
|
48 |
+
parser = OptionParser(usage=usage)
|
49 |
+
parser.add_option('-m','--java-mem',dest='jmem',default='500m',help='Set JVM memory heap size (e.g. 500m)')
|
50 |
+
parser.add_option('-v','--verbose',dest='verbose',action='store_true',default=False,help='Verbose mode')
|
51 |
+
parser.add_option('-o','--options',dest='extra',help='Pass options directly to TreebankPreprocessor')
|
52 |
+
parser.add_option('-p','--output-path',dest='output_path',help="Destination directory for the output")
|
53 |
+
|
54 |
+
(opts,args) = parser.parse_args()
|
55 |
+
|
56 |
+
if len(args) != 1:
|
57 |
+
parser.print_help()
|
58 |
+
sys.exit(-1)
|
59 |
+
|
60 |
+
conf_file = args[0]
|
61 |
+
|
62 |
+
run_treebank_pipeline(opts,conf_file)
|
63 |
+
|
64 |
+
if __name__ == '__main__':
|
65 |
+
main()
|
lng/L2SCA/stanford-parser-full-2014-01-04/build.xml
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!-- build.xml file for ant for JavaNLP -->
|
2 |
+
|
3 |
+
<!-- A "project" describes a set of targets that may be requested
|
4 |
+
when Ant is executed. The "default" attribute defines the
|
5 |
+
target which is executed if no specific target is requested,
|
6 |
+
and the "basedir" attribute defines the current working directory
|
7 |
+
from which Ant executes the requested task. This is normally
|
8 |
+
set to the current working directory.
|
9 |
+
-->
|
10 |
+
|
11 |
+
<project name="JavaNLP" default="compile" basedir=".">
|
12 |
+
|
13 |
+
<property name="build.home" value="${basedir}/classes"/>
|
14 |
+
<property name="build.tests" value="${basedir}/classes"/>
|
15 |
+
<property name="docs.home" value="${basedir}/docs"/>
|
16 |
+
<property name="src.home" value="${basedir}/src"/>
|
17 |
+
<property name="javadoc.home" value="${basedir}/javadoc"/>
|
18 |
+
|
19 |
+
|
20 |
+
<!-- ==================== Compilation Control Options ==================== -->
|
21 |
+
|
22 |
+
<!--
|
23 |
+
|
24 |
+
These properties control option settings on the Javac compiler when it
|
25 |
+
is invoked using the <javac> task.
|
26 |
+
|
27 |
+
compile.debug Should compilation include the debug option?
|
28 |
+
|
29 |
+
compile.deprecation Should compilation include the deprecation option?
|
30 |
+
|
31 |
+
compile.optimize Should compilation include the optimize option?
|
32 |
+
|
33 |
+
compile.source Source version compatibility
|
34 |
+
|
35 |
+
compile.target Target class version compatibility
|
36 |
+
|
37 |
+
-->
|
38 |
+
|
39 |
+
<property name="compile.debug" value="true"/>
|
40 |
+
<property name="compile.deprecation" value="false"/>
|
41 |
+
<property name="compile.optimize" value="true"/>
|
42 |
+
<property name="compile.source" value="1.6" />
|
43 |
+
<property name="compile.target" value="1.6" />
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
<!-- ==================== All Target ====================================== -->
|
49 |
+
|
50 |
+
<!--
|
51 |
+
|
52 |
+
The "all" target is a shortcut for running the "clean" target followed
|
53 |
+
by the "compile" target, to force a complete recompile.
|
54 |
+
|
55 |
+
-->
|
56 |
+
|
57 |
+
<target name="all" depends="clean,compile"
|
58 |
+
description="Clean build and dist directories, then compile"/>
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
<!-- ==================== Clean Target ==================================== -->
|
63 |
+
|
64 |
+
<!--
|
65 |
+
|
66 |
+
The "clean" target deletes any previous "build" and "dist" directory,
|
67 |
+
so that you can be ensured the application can be built from scratch.
|
68 |
+
|
69 |
+
-->
|
70 |
+
|
71 |
+
<target name="clean" description="Delete old classes">
|
72 |
+
<delete dir="${build.home}/edu"/>
|
73 |
+
</target>
|
74 |
+
|
75 |
+
|
76 |
+
<!-- ==================== Classpath Targets ==================================== -->
|
77 |
+
|
78 |
+
<!--
|
79 |
+
|
80 |
+
Sets the classpath for this project properly. We now always use the
|
81 |
+
lib dir within javanlp.
|
82 |
+
|
83 |
+
-->
|
84 |
+
|
85 |
+
<target name="classpath" description="Sets the classpath">
|
86 |
+
<path id="compile.classpath">
|
87 |
+
<fileset dir="${basedir}">
|
88 |
+
<include name="*.jar"/>
|
89 |
+
<exclude name="stanford-parser*"/>
|
90 |
+
</fileset>
|
91 |
+
</path>
|
92 |
+
</target>
|
93 |
+
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
|
98 |
+
<!-- ==================== Compile Target ================================== -->
|
99 |
+
|
100 |
+
<!--
|
101 |
+
|
102 |
+
The "compile" target transforms source files (from your "src" directory)
|
103 |
+
into object files in the appropriate location in the build directory.
|
104 |
+
This example assumes that you will be including your classes in an
|
105 |
+
unpacked directory hierarchy under "/WEB-INF/classes".
|
106 |
+
|
107 |
+
-->
|
108 |
+
|
109 |
+
<target name="compile" depends="prepare,classpath"
|
110 |
+
description="Compile Java sources">
|
111 |
+
|
112 |
+
<!-- Compile Java classes as necessary -->
|
113 |
+
<mkdir dir="${build.home}"/>
|
114 |
+
<javac srcdir="${src.home}"
|
115 |
+
destdir="${build.home}"
|
116 |
+
debug="${compile.debug}"
|
117 |
+
encoding="utf-8"
|
118 |
+
deprecation="${compile.deprecation}"
|
119 |
+
optimize="${compile.optimize}"
|
120 |
+
source="${compile.source}"
|
121 |
+
target="${compile.target}"
|
122 |
+
includeantruntime="false">
|
123 |
+
<classpath refid="compile.classpath"/>
|
124 |
+
<compilerarg value="-Xmaxerrs"/>
|
125 |
+
<compilerarg value="20"/>
|
126 |
+
<!-- <compilerarg value="-Xlint"/> -->
|
127 |
+
</javac>
|
128 |
+
|
129 |
+
<!-- Copy application resources -->
|
130 |
+
<!--
|
131 |
+
<copy todir="${build.home}/WEB-INF/classes">
|
132 |
+
<fileset dir="${src.home}" excludes="**/*.java"/>
|
133 |
+
</copy>
|
134 |
+
-->
|
135 |
+
|
136 |
+
</target>
|
137 |
+
|
138 |
+
|
139 |
+
<!-- ==================== Javadoc Target ================================== -->
|
140 |
+
|
141 |
+
<!--
|
142 |
+
|
143 |
+
The "javadoc" target creates Javadoc API documentation for the Java
|
144 |
+
classes included in your application. Normally, this is only required
|
145 |
+
when preparing a distribution release, but is available as a separate
|
146 |
+
target in case the developer wants to create Javadocs independently.
|
147 |
+
|
148 |
+
-->
|
149 |
+
|
150 |
+
<target name="javadoc" depends="compile"
|
151 |
+
description="Create Javadoc API documentation">
|
152 |
+
|
153 |
+
<mkdir dir="${javadoc.home}"/>
|
154 |
+
<javadoc sourcepath="${src.home}"
|
155 |
+
destdir="${javadoc.home}"
|
156 |
+
maxmemory="768m"
|
157 |
+
author="true"
|
158 |
+
source="1.6"
|
159 |
+
Overview="${src.home}/edu/stanford/nlp/overview.html"
|
160 |
+
Doctitle="Stanford JavaNLP API Documentation"
|
161 |
+
Windowtitle="Stanford JavaNLP API"
|
162 |
+
packagenames="*">
|
163 |
+
<bottom><![CDATA[<FONT SIZE=2><A HREF=\"http://nlp.stanford.edu\">Stanford NLP Group</A></FONT>]]></bottom>
|
164 |
+
<link href="http://java.sun.com/j2se/1.6.0/docs/api/"/>
|
165 |
+
</javadoc>
|
166 |
+
|
167 |
+
</target>
|
168 |
+
|
169 |
+
|
170 |
+
<!-- ==================== Prepare Target ================================== -->
|
171 |
+
|
172 |
+
<!--
|
173 |
+
|
174 |
+
The "prepare" target is used to create the "build" destination directory,
|
175 |
+
and copy the static contents of your web application to it. If you need
|
176 |
+
to copy static files from external dependencies, you can customize the
|
177 |
+
contents of this task.
|
178 |
+
|
179 |
+
Normally, this task is executed indirectly when needed.
|
180 |
+
|
181 |
+
-->
|
182 |
+
|
183 |
+
<target name="prepare">
|
184 |
+
|
185 |
+
<!-- Create build directories as needed -->
|
186 |
+
<mkdir dir="${build.home}"/>
|
187 |
+
|
188 |
+
</target>
|
189 |
+
|
190 |
+
</project>
|
lng/L2SCA/stanford-parser-full-2014-01-04/conf/atb-latest.conf
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
###########################
|
2 |
+
# Baseline ATB Newswire Datasets
|
3 |
+
#
|
4 |
+
# This file creates the three data sets used in the current
|
5 |
+
# line of Arabic parsing research:
|
6 |
+
#
|
7 |
+
# (1) Raw (no Bies mapping) / Unvocalized ("Raw")
|
8 |
+
# (2) Bies + DT / Unvocalized ("Unvoc")
|
9 |
+
# (3) Bies + DT / Vocalized ("Voc")
|
10 |
+
# (4) Bies + DT / Unvocalized ("NoDashTags")
|
11 |
+
# -No traces or phrasal tag decorations. For training the Berkeley parser.
|
12 |
+
#
|
13 |
+
# Note that "Bies + DT" refers to the enhancement to the Bies mappings
|
14 |
+
# proposed by Kulick et al. (2006).
|
15 |
+
#
|
16 |
+
# The training/dev/test set is the "Mona Diab split" from the 2005 JHU
|
17 |
+
# workshop on parsing Arabic dialects (Chiang et al., 2006).
|
18 |
+
#
|
19 |
+
#
|
20 |
+
# IMPORTANT: All paths should reference the base Arabic data directory
|
21 |
+
#
|
22 |
+
# /u/nlp/data/Arabic
|
23 |
+
#
|
24 |
+
###########################
|
25 |
+
|
26 |
+
NAME=1 Raw Train
|
27 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
28 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
29 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
30 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
31 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train
|
32 |
+
OUTPUT_ENCODING=UTF8
|
33 |
+
FLAT=true
|
34 |
+
|
35 |
+
;;
|
36 |
+
|
37 |
+
NAME=1 Raw Dev
|
38 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
39 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
40 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
41 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
42 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev
|
43 |
+
OUTPUT_ENCODING=UTF8
|
44 |
+
FLAT=true
|
45 |
+
|
46 |
+
;;
|
47 |
+
|
48 |
+
NAME=1 Raw Test
|
49 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
50 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
51 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
52 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
53 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test
|
54 |
+
OUTPUT_ENCODING=UTF8
|
55 |
+
FLAT=true
|
56 |
+
|
57 |
+
;;
|
58 |
+
|
59 |
+
NAME=2 Unvoc All
|
60 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
61 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
62 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
63 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
64 |
+
OUTPUT_ENCODING=UTF8
|
65 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
66 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
67 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
68 |
+
USEDET=true
|
69 |
+
|
70 |
+
;;
|
71 |
+
|
72 |
+
NAME=2 Unvoc Train
|
73 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
74 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
75 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
76 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
77 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train
|
78 |
+
OUTPUT_ENCODING=UTF8
|
79 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
80 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
81 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
82 |
+
USEDET=true
|
83 |
+
|
84 |
+
;;
|
85 |
+
|
86 |
+
NAME=2 Unvoc Dev
|
87 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
88 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
89 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
90 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
91 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev
|
92 |
+
OUTPUT_ENCODING=UTF8
|
93 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
94 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
95 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
96 |
+
USEDET=true
|
97 |
+
|
98 |
+
;;
|
99 |
+
|
100 |
+
NAME=2 Unvoc Test
|
101 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
102 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
103 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
104 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
105 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test
|
106 |
+
OUTPUT_ENCODING=UTF8
|
107 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
108 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
109 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
110 |
+
USEDET=true
|
111 |
+
|
112 |
+
;;
|
113 |
+
|
114 |
+
NAME=3 Voc Train
|
115 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
116 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel
|
117 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel
|
118 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel
|
119 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train
|
120 |
+
OUTPUT_ENCODING=UTF8
|
121 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
122 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
123 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
124 |
+
USEDET=true
|
125 |
+
LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper
|
126 |
+
FLAT=true
|
127 |
+
|
128 |
+
;;
|
129 |
+
|
130 |
+
NAME=3 Voc Dev
|
131 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
132 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel
|
133 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel
|
134 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel
|
135 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev
|
136 |
+
OUTPUT_ENCODING=UTF8
|
137 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
138 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
139 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
140 |
+
USEDET=true
|
141 |
+
LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper
|
142 |
+
FLAT=true
|
143 |
+
|
144 |
+
;;
|
145 |
+
|
146 |
+
NAME=3 Voc Test
|
147 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
148 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/with-vowel
|
149 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/with-vowel
|
150 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/with-vowel
|
151 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test
|
152 |
+
OUTPUT_ENCODING=UTF8
|
153 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
154 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
155 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
156 |
+
USEDET=true
|
157 |
+
LEXMAPPER=edu.stanford.nlp.international.arabic.pipeline.UnvocLexicalMapper
|
158 |
+
FLAT=true
|
159 |
+
|
160 |
+
;;
|
161 |
+
|
162 |
+
|
163 |
+
NAME=4 Unvoc Train NoDashTags
|
164 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
165 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
166 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
167 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
168 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/train
|
169 |
+
OUTPUT_ENCODING=UTF8
|
170 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
171 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
172 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
173 |
+
NODASHTAGS=true
|
174 |
+
ADDROOT=true
|
175 |
+
USEDET=true
|
176 |
+
|
177 |
+
;;
|
178 |
+
|
179 |
+
NAME=4 Unvoc Dev NoDashTags
|
180 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
181 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
182 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
183 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
184 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/dev
|
185 |
+
OUTPUT_ENCODING=UTF8
|
186 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
187 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
188 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
189 |
+
NODASHTAGS=true
|
190 |
+
ADDROOT=true
|
191 |
+
USEDET=true
|
192 |
+
|
193 |
+
;;
|
194 |
+
|
195 |
+
NAME=4 Unvoc Test NoDashTags
|
196 |
+
TYPE=edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset
|
197 |
+
PATHp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/data/penntree/without-vowel
|
198 |
+
PATHp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/data/penntree/without-vowel
|
199 |
+
PATHp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/data/penntree/without-vowel
|
200 |
+
SPLIT=/u/nlp/data/Arabic/splits/mona-diab-split/test
|
201 |
+
OUTPUT_ENCODING=UTF8
|
202 |
+
MAPPINGp1=/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
203 |
+
MAPPINGp2=/u/nlp/data/Arabic/ldc/atb-latest/p2/docs/atb2-v3.0-taglist-conversion-to-PennPOS-forrelease.lisp
|
204 |
+
MAPPINGp3=/u/nlp/data/Arabic/ldc/atb-latest/p3/docs/atb3-v3.1-taglist-conversion-to-PennPOS-forrelease.lisp
|
205 |
+
NODASHTAGS=true
|
206 |
+
ADDROOT=true
|
207 |
+
USEDET=true
|
208 |
+
|
209 |
+
;;
|
lng/L2SCA/stanford-parser-full-2014-01-04/conf/ftb-latest.conf
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
###########################
|
2 |
+
# Baseline FTB Datasets
|
3 |
+
#
|
4 |
+
# IMPORTANT: All paths should reference the base Arabic data directory
|
5 |
+
#
|
6 |
+
# /u/nlp/data/FrenchTreebank/versionJune2010
|
7 |
+
#
|
8 |
+
###########################
|
9 |
+
|
10 |
+
NAME=FTB All
|
11 |
+
TYPE=edu.stanford.nlp.international.french.pipeline.FTBDataset
|
12 |
+
PATH=/u/nlp/data/FrenchTreebank/versionJune2010/corpus-fonctions
|
13 |
+
OUTPUT_ENCODING=UTF8
|
14 |
+
TVISITOR=edu.stanford.nlp.international.french.pipeline.FTBCorrectorVisitor
|
15 |
+
FLAT=true
|
16 |
+
|
17 |
+
;;
|
18 |
+
|
19 |
+
NAME=FTB Train
|
20 |
+
TYPE=edu.stanford.nlp.international.french.pipeline.FTBDataset
|
21 |
+
PATH=/u/nlp/data/FrenchTreebank/versionJune2010/corpus-fonctions
|
22 |
+
SPLIT=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/splits/candito.train
|
23 |
+
OUTPUT_ENCODING=UTF8
|
24 |
+
TVISITOR=edu.stanford.nlp.international.french.pipeline.FTBCorrectorVisitor
|
25 |
+
|
26 |
+
;;
|
27 |
+
|
28 |
+
NAME=FTB Dev
|
29 |
+
TYPE=edu.stanford.nlp.international.french.pipeline.FTBDataset
|
30 |
+
PATH=/u/nlp/data/FrenchTreebank/versionJune2010/corpus-fonctions
|
31 |
+
SPLIT=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/splits/candito.dev
|
32 |
+
OUTPUT_ENCODING=UTF8
|
33 |
+
TVISITOR=edu.stanford.nlp.international.french.pipeline.FTBCorrectorVisitor
|
34 |
+
|
35 |
+
;;
|
36 |
+
|
37 |
+
NAME=FTB Test
|
38 |
+
TYPE=edu.stanford.nlp.international.french.pipeline.FTBDataset
|
39 |
+
PATH=/u/nlp/data/FrenchTreebank/versionJune2010/corpus-fonctions
|
40 |
+
SPLIT=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/splits/candito.test
|
41 |
+
OUTPUT_ENCODING=UTF8
|
42 |
+
TVISITOR=edu.stanford.nlp.international.french.pipeline.FTBCorrectorVisitor
|
43 |
+
|
44 |
+
;;
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/arabic-onesent-utf8.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
و نشر العدل من خلال قضاء مستقل .
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-gb18030.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
���� ϣ�� ���� û�� ���� ������ �ƻ� ��
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-unseg-gb18030.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
����Ժ��ǰ��������֪ͨ��Ҫ�������ʵ��ʵ��֤�г���Ӧ�ĸ������ߣ�ά����ʳƷ�۸��ȶ���
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-unseg-utf8.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
国务院日前发出紧急通知,要求各地切实落实保证市场供应的各项政策,维护副食品价格稳定。
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/chinese-onesent-utf8.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
俄国 希望 伊朗 没有 制造 核武器 计划 。
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/english-onesent.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
The quick brown fox jumped over the lazy dog.
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/french-onesent.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Le gouvernement se résout donc à renvoyer la balle dans le camp de partenaires qui ont amplement fait la preuve de leur incapacité à gérer le système de santé .
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/german-onesent.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Wir haben nichts zu tun .
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/pos-sentences.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
People can butter their bread with a knife .
|
2 |
+
People can butter/VB their bread with a knife .
|
3 |
+
People can butter/NN their bread with a knife .
|
4 |
+
People/NNS can/MD butter/VB their/PRP$ bread/NN with/IN a/DT knife/NN ./.
|
5 |
+
People/NNS can/VB butter/NN their/PRP$ bread/NN with/IN a/DT knife/NN ./.
|
6 |
+
People/NNS can/NN butter/NN their/PRP$ bread/NN with/IN a/DT knife/NN ./.
|
7 |
+
People/NN can/NN butter/NN their/NN bread/NN with/NN a/NN knife/NN ./NN
|
lng/L2SCA/stanford-parser-full-2014-01-04/data/testsent.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Scores of properties are under extreme fire threat as a huge blaze
|
2 |
+
continues to advance through Sydney's north-western suburbs. Fires
|
3 |
+
have also shut down the major road and rail links between Sydney and
|
4 |
+
Gosford.
|
5 |
+
|
6 |
+
The promotional stop in Sydney was everything to be expected for a
|
7 |
+
Hollywood blockbuster - phalanxes of photographers, a stretch limo to
|
8 |
+
a hotel across the Quay - but with one difference. A line-up of
|
9 |
+
masseurs was waiting to take the media in hand. Never has the term
|
10 |
+
"massaging the media" seemed so accurate.
|
lng/L2SCA/stanford-parser-full-2014-01-04/ejml-0.23.jar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0250933fe8cc6a44eb098016d4dadaba7746a27efc3d5a7f4f4c9bf247cfe09
|
3 |
+
size 211938
|
lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.bat
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
:: runs the parser GUI
|
2 |
+
:: usage lexparser-gui [parserDataFilename [textFileName]]
|
3 |
+
java -mx800m -cp "*" edu.stanford.nlp.parser.ui.Parser
|
lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.command
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
|
2 |
+
#
|
3 |
+
# Runs the Lexicalized Parser GUI. You can just run this script and then
|
4 |
+
# load a grammar and file to be parsed from the menus or you can specify
|
5 |
+
# them on the command line.
|
6 |
+
#
|
7 |
+
# Usage: ./lexparser-gui.sh [parserDataFilename [textFileName]]
|
8 |
+
#
|
9 |
+
|
10 |
+
|
11 |
+
scriptdir=`dirname $0`
|
12 |
+
|
13 |
+
java -mx800m -cp "$scriptdir/*" edu.stanford.nlp.parser.ui.Parser $*
|
lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-gui.sh
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
|
2 |
+
#
|
3 |
+
# Runs the Lexicalized Parser GUI. You can just run this script and then
|
4 |
+
# load a grammar and file to be parsed from the menus or you can specify
|
5 |
+
# them on the command line.
|
6 |
+
#
|
7 |
+
# Usage: ./lexparser-gui.sh [parserDataFilename [textFileName]]
|
8 |
+
#
|
9 |
+
|
10 |
+
|
11 |
+
scriptdir=`dirname $0`
|
12 |
+
|
13 |
+
java -mx800m -cp "$scriptdir/*" edu.stanford.nlp.parser.ui.Parser $*
|
lng/L2SCA/stanford-parser-full-2014-01-04/lexparser-lang-train-test.sh
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
|
2 |
+
#
|
3 |
+
# Defines standard configurations for training and evaluating the
|
4 |
+
# multilingual parsers (Arabic, Chinese, German, French). You can
|
5 |
+
# also train and test the English parsers with this script.
|
6 |
+
#
|
7 |
+
# For details on the language-specific options, see the javadocs and
|
8 |
+
# lexparser_lang.def.
|
9 |
+
#
|
10 |
+
|
11 |
+
# Memory limit
|
12 |
+
mem=6g
|
13 |
+
|
14 |
+
if [ ! $# -ge 5 ]; then
|
15 |
+
echo Usage: `basename $0` lang len train_file test_file out_file features
|
16 |
+
echo
|
17 |
+
echo ' lang : Language to parse (Arabic, English, Chinese, German, French)'
|
18 |
+
echo ' len : Maximum length of the sentences to parse'
|
19 |
+
echo ' train_file : Training treebank file'
|
20 |
+
echo ' test_file : Test treebank file (for evaluation)'
|
21 |
+
echo ' out_file : Prefix for the output filename'
|
22 |
+
echo ' features : Variable length list of optional parser features'
|
23 |
+
echo
|
24 |
+
echo 'Parser memory limit is currently:' "$mem"
|
25 |
+
echo
|
26 |
+
exit
|
27 |
+
fi
|
28 |
+
|
29 |
+
# Setup command-line options
|
30 |
+
lang=$1
|
31 |
+
len=$2
|
32 |
+
train_path=$3
|
33 |
+
test_file=$4
|
34 |
+
out_file=$5
|
35 |
+
|
36 |
+
shift 5
|
37 |
+
|
38 |
+
# Language-specific configuration
|
39 |
+
scriptdir=`dirname $0`
|
40 |
+
echo $JAVANLP_HOME
|
41 |
+
source $JAVANLP_HOME/projects/core/scripts/lexparser_lang.def
|
42 |
+
|
43 |
+
# Setting classpath
|
44 |
+
#CLASSPATH="$CLASSPATH":"$scriptdir/*"
|
45 |
+
|
46 |
+
# Run the Stanford parser
|
47 |
+
java -Xmx"$mem" -cp "$scriptdir/*:$CLASSPATH" edu.stanford.nlp.parser.lexparser.LexicalizedParser -maxLength "$len" \
|
48 |
+
-tLPP "$tlp" $lang_opts $* -writeOutputFiles \
|
49 |
+
-outputFilesExtension "$out_file"."$len".stp -outputFormat "penn" \
|
50 |
+
-outputFormatOptions "removeTopBracket,includePunctuationDependencies" -train "$train_path" -test "$test_file"
|