Spaces:
Sleeping
Sleeping
import java.util.Collection; | |
import java.util.List; | |
import java.io.StringReader; | |
import edu.stanford.nlp.process.Tokenizer; | |
import edu.stanford.nlp.process.TokenizerFactory; | |
import edu.stanford.nlp.process.CoreLabelTokenFactory; | |
import edu.stanford.nlp.process.DocumentPreprocessor; | |
import edu.stanford.nlp.process.PTBTokenizer; | |
import edu.stanford.nlp.ling.CoreLabel; | |
import edu.stanford.nlp.ling.HasWord; | |
import edu.stanford.nlp.ling.Sentence; | |
import edu.stanford.nlp.trees.*; | |
import edu.stanford.nlp.parser.lexparser.LexicalizedParser; | |
class ParserDemo { | |
/** | |
* The main method demonstrates the easiest way to load a parser. | |
* Simply call loadModel and specify the path of a serialized grammar | |
* model, which can be a file, a resource on the classpath, or even a URL. | |
* For example, this demonstrates loading from the models jar file, which | |
* you therefore need to include in the classpath for ParserDemo to work. | |
*/ | |
public static void main(String[] args) { | |
LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); | |
if (args.length > 0) { | |
demoDP(lp, args[0]); | |
} else { | |
demoAPI(lp); | |
} | |
} | |
/** | |
* demoDP demonstrates turning a file into tokens and then parse | |
* trees. Note that the trees are printed by calling pennPrint on | |
* the Tree object. It is also possible to pass a PrintWriter to | |
* pennPrint if you want to capture the output. | |
*/ | |
public static void demoDP(LexicalizedParser lp, String filename) { | |
// This option shows loading, sentence-segmenting and tokenizing | |
// a file using DocumentPreprocessor. | |
TreebankLanguagePack tlp = new PennTreebankLanguagePack(); | |
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); | |
// You could also create a tokenizer here (as below) and pass it | |
// to DocumentPreprocessor | |
for (List<HasWord> sentence : new DocumentPreprocessor(filename)) { | |
Tree parse = lp.apply(sentence); | |
parse.pennPrint(); | |
System.out.println(); | |
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); | |
Collection tdl = gs.typedDependenciesCCprocessed(); | |
System.out.println(tdl); | |
System.out.println(); | |
} | |
} | |
/** | |
* demoAPI demonstrates other ways of calling the parser with | |
* already tokenized text, or in some cases, raw text that needs to | |
* be tokenized as a single sentence. Output is handled with a | |
* TreePrint object. Note that the options used when creating the | |
* TreePrint can determine what results to print out. Once again, | |
* one can capture the output by passing a PrintWriter to | |
* TreePrint.printTree. | |
*/ | |
public static void demoAPI(LexicalizedParser lp) { | |
// This option shows parsing a list of correctly tokenized words | |
String[] sent = { "This", "is", "an", "easy", "sentence", "." }; | |
List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); | |
Tree parse = lp.apply(rawWords); | |
parse.pennPrint(); | |
System.out.println(); | |
// This option shows loading and using an explicit tokenizer | |
String sent2 = "This is another sentence."; | |
TokenizerFactory<CoreLabel> tokenizerFactory = | |
PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); | |
Tokenizer<CoreLabel> tok = | |
tokenizerFactory.getTokenizer(new StringReader(sent2)); | |
List<CoreLabel> rawWords2 = tok.tokenize(); | |
parse = lp.apply(rawWords2); | |
TreebankLanguagePack tlp = new PennTreebankLanguagePack(); | |
GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); | |
GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); | |
List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); | |
System.out.println(tdl); | |
System.out.println(); | |
// You can also use a TreePrint object to print trees and dependencies | |
TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); | |
tp.printTree(parse); | |
} | |
private ParserDemo() {} // static methods only | |
} | |