|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object | +--edu.stanford.nlp.ie.pcfg.PNPC
Statistical classifier of unseen proper noun phrases. Supports training and testing on data files. Uses an n-gram word-length model, an n-gram character model, and a word model.
Field Summary | |
protected int[] |
categoryCounts
|
List |
categoryNames
|
static int[] |
charBinCutoffs
|
protected double |
charConvergenceMargin
|
protected double[][][] |
charInterpolationConstants
|
protected HashMap[] |
charSequenceCounts
|
protected HashMap[] |
charSequenceTotalsByLength
|
protected int[] |
charTotalCounts
|
protected HashMap[] |
charWordInterpolationConstants
|
static int |
cn
|
static char |
END_SYMBOL
|
protected List[] |
heldOutExamples
|
protected int |
heldOutPercent
|
static int[] |
lengthBinCutoffs
|
protected double |
lengthConvergenceMargin
|
protected double[][][] |
lengthInterpolationConstants
|
protected double |
lengthNormalization
|
protected double[] |
lengthNormalizations
|
protected HashMap[] |
lengthSequenceCounts
|
static int |
ln
|
protected int |
maxPriorBoost
|
protected int |
numCategories
|
protected int |
numCharWordSteps
|
protected int |
numExamples
|
protected double |
priorBoost
|
static Random |
rand
|
static char |
START_SYMBOL
|
protected HashMap[] |
wordCountsByLength
|
protected int[] |
wordTotalCounts
|
protected HashMap[] |
wordTotalsByLength
|
Constructor Summary | |
PNPC(List categoryNames,
List trainingLines)
Constructs a new PNPC which is trained on the given training lines. |
Method Summary | |
protected void |
addCounts(String line,
int category)
Counts relevant statistics for the given example in its given category. |
protected void |
computeCharSequenceTotals()
Computes the total probability of generating all words of a given length. |
String |
generateLine(int category)
Generates a novel example of the given category, starting with (cn-1) start symbols and ending with an end symbol. |
String |
generateWord(int wordLength,
String initialContext,
char finalChar,
int category)
Randomly generates a word of the given length, starting with the given initial context and ending with the given final char, by sampling from the char n-gram model of the given category. |
int |
getBestCategory(String line)
Returns the category that generates the given line with the highest probability. |
protected int |
getCharBin(String charSequence,
int category)
Returns the index of the appropriate EM interpolation parameter bin for the given char ngram. |
protected int |
getCharBinCount()
Returns the number of bins used for char EM interpolation. |
double |
getEmpiricalProb(List lengthSequence,
int category)
Returns the empirical estimate of the probability of the last word length in the sequence given the sequence excluding that length, as observed within the given category. |
double |
getEmpiricalProb(String charSequence,
int category)
Returns the empirical estimate of the probability of the last char in the sequence given the sequence excluding that char, as observed within the given category. |
double |
getEmpiricalProb(String word,
int wordLength,
int category)
Returns the empirical estimate of the probability of the given word given the word's length and the given category. |
static String |
getEndMarkedString(String line)
Returns the given line prepended with enough ' ' symbols to allow n-gram parsing. |
protected int |
getHeldOutScore()
Runs the classifier on the held-out examples and returns the number of correctly classified examples. |
double |
getInterpolatedProb(List lengthSequence,
int category)
Returns a linearly interpolated estimate of the last length in the sequence given the rest of it. |
double |
getInterpolatedProb(String charSequence,
int category)
Returns a linearly interpolated estimate of the last char in the sequence given the rest of it. |
protected int |
getLengthBin(List lengthSequence,
int category)
Returns the index of the appropriate EM interpolation parameter bin for the given length ngram. |
protected int |
getLengthBinCount()
Returns the number of bins used for length EM interpolation. |
int |
getNumCategories()
Returns the number of different categories represented in this classifier. |
double |
getPriorProb(int category)
Returns the empirical a priori probability of each category, as observed in the training data (fraction of each category in the whole training data). |
static String |
getPureString(String word)
Prunes the first (cn-1) chars from the beginning of the word as well as the final char. |
double |
getScore(String line,
int category)
Returns the score for the given example as scored in the given category. |
static List |
getWordLengths(String line)
Takes an end-marked string and returns a list of Integers for the length of each word. |
static List |
getWordsWithContext(String line)
Takes an end-marked string and returns a List of strings, one for each word in the line. |
protected void |
incrementCount(HashMap map,
Object key)
Adds 1 to the count for the given key in the given map. |
protected void |
incrementCountByLength(HashMap map,
int length,
Object key)
Adds 1 to the count for the given key in the given map under the given length. |
protected void |
initCounts()
Initializes and zeroes all variables and counts before training. |
protected void |
learnCharInterpolationConstants()
Learns good weights for deleted interpolation in the char n-gram model via EM. |
protected void |
learnCharWordInterpolationConstants()
Computes the best interpolation weights for the char n-gram vs word model with a line search. |
protected void |
learnLengthInterpolationConstants()
Learns good weights for deleted interpolation in the length n-gram model via EM. |
protected void |
learnLengthNormalizations()
Learns a constant for each category to normalize word probabilities by length. |
protected void |
learnPriorBoost()
Sets the log-prior multiplier (priorBoost) to the best value on the held-out set. |
protected void |
test(String testFilename)
Runs the classifier on each line in the given test file and prints out the category with the highest score. |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
public List categoryNames
public static final int ln
public static final int cn
public static final char START_SYMBOL
public static final char END_SYMBOL
public static final Random rand
public static final int[] charBinCutoffs
public static final int[] lengthBinCutoffs
protected int numCategories
protected int numExamples
protected double priorBoost
protected double lengthNormalization
protected final int heldOutPercent
protected List[] heldOutExamples
protected final double charConvergenceMargin
protected final double lengthConvergenceMargin
protected final int numCharWordSteps
protected final int maxPriorBoost
protected int[] categoryCounts
protected HashMap[] lengthSequenceCounts
protected double[][][] lengthInterpolationConstants
protected int[] wordTotalCounts
protected HashMap[] charSequenceCounts
protected double[][][] charInterpolationConstants
protected HashMap[] charSequenceTotalsByLength
protected int[] charTotalCounts
protected HashMap[] wordCountsByLength
protected HashMap[] wordTotalsByLength
protected HashMap[] charWordInterpolationConstants
protected double[] lengthNormalizations
Constructor Detail |
public PNPC(List categoryNames, List trainingLines)
Method Detail |
protected void initCounts()
protected void addCounts(String line, int category)
protected void incrementCount(HashMap map, Object key)
protected void incrementCountByLength(HashMap map, int length, Object key)
protected void learnPriorBoost()
protected void learnLengthInterpolationConstants()
protected void learnCharInterpolationConstants()
protected void learnCharWordInterpolationConstants()
protected void computeCharSequenceTotals()
protected void learnLengthNormalizations()
protected int getHeldOutScore()
public int getBestCategory(String line)
public double getScore(String line, int category)
public double getInterpolatedProb(String charSequence, int category)
public double getEmpiricalProb(String charSequence, int category)
public double getInterpolatedProb(List lengthSequence, int category)
public double getEmpiricalProb(List lengthSequence, int category)
public double getEmpiricalProb(String word, int wordLength, int category)
public double getPriorProb(int category)
public int getNumCategories()
public static String getEndMarkedString(String line)
public static String getPureString(String word)
public static List getWordLengths(String line)
public static List getWordsWithContext(String line)
protected int getCharBin(String charSequence, int category)
protected int getCharBinCount()
protected int getLengthBin(List lengthSequence, int category)
protected int getLengthBinCount()
protected void test(String testFilename) throws FileNotFoundException, IOException
FileNotFoundException
IOException
public String generateWord(int wordLength, String initialContext, char finalChar, int category)
public String generateLine(int category)
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |