Lexical Resources
- Bigram Hidden Markov Model (HMM): statistical model of POS tag co-occurrence
- HMM and word lexicon used to assign most likely tag
- Word morphology to classify unknown terms
# Pick the most likely tag for $word, given the tag assigned to the
# preceding word.
#
# Every tag listed under $hmm->{$prev_tag} is a candidate. A candidate is
# scored as
#
#     $hmm->{$prev_tag}{$tag} * ( 1 + $lexicon->{$word}{$tag} / $word_count )
#
# where $word_count is $lexicon->{$word}{count}. A missing per-tag lexicon
# value counts as 0. For words whose count is at least THRESHOLD, tags with
# no lexicon value for the word are skipped outright.
#
# Parameters:
#   $self     - invocant (not used by this method)
#   $prev_tag - tag of the preceding word; key into $hmm
#   $word     - word to tag; key into $lexicon
#
# Returns the highest-scoring tag, or undef when no candidate scores
# above 0 (including when $prev_tag has no entry in $hmm).
#
# NOTE(review): presumably $lexicon->{$word}{$tag} is how often $word was
# seen with $tag, making the ratio a relative frequency - confirm against
# the code that builds the lexicon.
sub _assign_tag {
    my ( $self, $prev_tag, $word ) = @_;

    # Fetch the sub-hashes as plain rvalues with an empty fallback. Chaining
    # ->{$word}{count} or calling keys on %{ $hmm->{$prev_tag} } directly
    # would autovivify an empty entry in the shared tables for every unseen
    # word or tag that is merely looked up.
    my $entry       = $lexicon->{$word} || {};
    my $transitions = $hmm->{$prev_tag} || {};

    # A word absent from the lexicon has no count. Treat it as 0 so it is
    # handled as a rare word instead of triggering a division by zero below.
    my $word_count = $entry->{count} || 0;

    my $best_tag;
    my $highest_prob = 0;

    foreach my $tag ( keys %{$transitions} ) {
        my $tag_count = $entry->{$tag};

        # Shortcut for common words: ignore tags never recorded for them
        next unless defined $tag_count or $word_count < THRESHOLD;

        # Lexical evidence for this tag; none at all when the word is unknown
        my $lexical = $word_count ? ( $tag_count || 0 ) / $word_count : 0;

        my $prob = $transitions->{$tag} * ( 1 + $lexical );
        if ( $prob > $highest_prob ) {
            $highest_prob = $prob;
            $best_tag     = $tag;
        }
    }

    return $best_tag;
}
|