Commit 91602c2f authored by Mathieu Rodic's avatar Mathieu Rodic

[FEAT] made MElt tagger independent from the installation of a software

https://forge.iscpif.fr/issues/1500
parent 4cead2ac
......@@ -52,19 +52,19 @@ class MeltTagger(Tagger):
def start(self, language='fr', melt_data_path='melttagger'):
basepath = os.path.dirname(os.path.realpath(__file__))
path = os.path.join(basepath, melt_data_path, language)
path = os.path.join(basepath, melt_data_path)
self._pos_tagger = POSTagger()
self._pos_tagger.load_tag_dictionary('%s/tag_dict.json' % path)
self._pos_tagger.load_lexicon('%s/lexicon.json' % path)
self._pos_tagger.load_model('%s' % path)
self._pos_tagger.load_tag_dictionary('%s/%s/tag_dict.json' % (path, language))
self._pos_tagger.load_lexicon('%s/%s/lexicon.json' % (path, language))
self._pos_tagger.load_model('%s/%s' % (path, language))
self._preprocessing_commands = (
# ('/usr/local/bin/clean_noisy_characters.sh', ),
# ('/usr/local/bin/MElt_normalizer.pl', '-nc', '-c', '-d', '/usr/local/share/melt/normalization/%s' % language, '-l', language, ),
('/usr/local/share/melt/segmenteur.pl', '-a', '-ca', '-af=/usr/local/share/melt/pctabr', '-p', 'r'),
('%s/MElt_normalizer.pl' % path, '-nc', '-c', '-d', '%s/%s' % (path, language), '-l', language, ),
('%s/segmenteur.pl' % path, '-a', '-ca', '-af=%s/pctabr' % path, '-p', 'r'),
)
self._lemmatization_commands = (
('/usr/local/bin/MElt_postprocess.pl', '-npp', '-l', language),
('MElt_lemmatizer.pl', '-m', '/usr/local/share/melt/%s' % language),
('%s/MElt_postprocess.pl' % path, '-npp', '-l', language),
('%s/MElt_lemmatizer.pl' % path, '-m', '%s/%s' % (path, language)),
)
def stop(self):
......
#!/usr/bin/perl
# MElt_lemmatizer.pl — lemmatizes POS-tagged text in Brown format using the
# SQLite lemmatization database shipped with a MElt model.
use utf8;
use locale;
# All I/O is UTF-8 text.
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
use DBI;
use Encode;
# Defaults; overridden by the command-line options parsed below.
my $datadir = ".";        # base directory containing per-language model directories
my $language = "";        # -l : language code (falls back to "fr" when neither -l nor -m is given)
my $model = "";           # -m : explicit model directory
my $lexfile = "";         # NOTE(review): declared but never assigned in this chunk — presumably for -lex; verify
my $it_mapping = 0;       # -itmapping : Italian-specific clitic/lemma rules
my $flag_unknowns = "*";  # prefix for lemmas of forms unknown to the lexicon (-nfu clears it)
my $verbose = 0;          # -v : print configuration on STDERR
my $multiple_lemmas = 0;  # -ml : output all candidate lemmas joined by "|"
my $silent = 0;           # -nv : suppress progress messages
# --- Command-line option parsing --------------------------------------------
# Unknown options are silently ignored (historical behavior, kept as-is).
while (1) {
    $_ = shift;
    if (/^-l$/) {$language = shift;}
    elsif (/^-m$/) {$model = shift;}
    elsif (/^-nv$/) {$silent = 1;}
    elsif (/^-l?db$/) {$dbfile = shift;}
    elsif (/^-nfu$/) {$flag_unknowns = "";}
    elsif (/^-v$/) {$verbose = 1;}
    elsif (/^-itmapping$/) {$it_mapping = 1;}
    elsif (/^-lcl$/) {$lower_case_lemmas = 1;}
    elsif (/^-ml$/) {$multiple_lemmas = 1;}
    # BUGFIX: the pattern ended with a second '^' anchor (/^--?help^/), which can
    # never match, so '--help' and '-help' silently fell through.
    elsif (/^-h$/ || /^--?help$/) {
        print STDERR <<END;
Usage: MElt_lemmatizer.pl [ -l language | -m model | -lex lexicon ] [ -nfu ] [ -itmapping ] [ -lcl ] < input > output
Input: POS-tagged text in Brown format. The text *must* have been tagged using MElt, as this lemmatizer is based
on the (external) lexicon used by a particular MElt model and on the tags assigned by MElt using this model
Brown format: word1/pos1 word2/pos2 ... wordn/posn (newline = new sentence)
Output: word1/pos1/lemma1 word2/pos2/lemma2 ... wordn/posn/lemman (newline = new sentence; lemmas for words
unknown to the lexicon are prefixed with '*')
Options:
-l language Use the lexicon of the default MElt model for language 'language'
-m model Use the lexicon of the MElt model to be found in the directory 'model'
-lex lexicon Use the lexicon provided
-v Verbose (outputs information about the options used on STDERR before lemmatizing)
-nfu Do not prefix lemmas for forms unknown to the lexicon with the character '*'
-lcl Output all lemmas in lowercase
-itmapping Triggers special conversion and adaptation rules for Italian
-h Print this
END
        exit(0);
    }
    elsif (/^$/) {last}
}
# Italian always uses the Italian-specific mappings.
# BUGFIX: this test previously read undeclared variables ($lang/$itmapping)
# instead of $language/$it_mapping, so it could never take effect.
if ($language eq "it") {$it_mapping = 1}
# Resolve the lemmatization database path unless one was given explicitly.
# NOTE(review): the error message below mentions -lex, but the option actually
# parsed above is -db/-ldb — confirm against the upstream script.
if ($dbfile eq "") {
    if ($model ne "") {
        if ($language ne "") {
            die "Error: options -l and -m can not be used simultaneously";
        }
    } else {
        if ($language eq "") {
            $language = "fr";
        }
        $model = $datadir."/".$language;
    }
    $dbfile = $model."/lemmatization_data.db";
} else {
    if ($language ne "" || $model ne "") {
        die "Error: option -lex can not be used with options -l or -m";
    }
}
# Report the effective configuration when -v was given.
if ($verbose) {
    print STDERR "Lemmatization database used: $dbfile\n";
    if ($flag_unknowns eq "") {
        print STDERR "Lemmas for forms unknown to the lexicon are not prefixed by any special character\n" ;
    } else {
        print STDERR "Lemmas for forms unknown to the lexicon are prefixed with the character '$flag_unknowns'\n" ;
    }
    print STDERR "Lemmas are lowercased\n" if ($lower_case_lemmas);
    print STDERR "Special mappings for Italian activated\n" if ($it_mapping);
}
# Open the model's SQLite database and prepare the lookup statements:
#   sth_cfl      — exact (cat, form) -> lemma;
#   sth_cfslsc1  — any row for (cat, form-suffix)           (existence check);
#   sth_cfslsc2  — one (cat, form-suffix) row ordered by count (best candidate);
#   sth_cfslsc3  — all (cat, form-suffix) rows               (all candidates).
my $dbh = DBI->connect("dbi:SQLite:$dbfile", "", "", {RaiseError => 1, AutoCommit => 0});
my $sth_cfl=$dbh->prepare('select lemma from cat_form2lemma where cat=? and form=?');
my $sth_cfslsc1=$dbh->prepare('select lemmasuff from cat_formsuff_lemmasuff2count where cat=? and formsuff=? limit 1');
my $sth_cfslsc2=$dbh->prepare('select lemmasuff from cat_formsuff_lemmasuff2count where cat=? and formsuff=? order by count limit 1');
my $sth_cfslsc3=$dbh->prepare('select lemmasuff from cat_formsuff_lemmasuff2count where cat=? and formsuff=?');
# Map bracket placeholder tokens back to the literal characters stored in the lexicon.
%equiv = (
    "--RBR--" => ")",
    "--LBR--" => "(",
    "--RRB--" => ")",
    "--LRB--" => "(",
);
print STDERR " LEMMATIZER: Lemmatizing...\n" unless $silent;
# Per-run memoization caches for the four lookup helpers defined below.
my %get_cat_form2lemma_cache;
my %includes_data_for_cat_formsuff_cache;
my %get_best_lemmasuffs_cache;
my %get_all_lemmasuffs_cache;
# Main loop: read Brown-format tagged sentences (word/TAG word/TAG ...) from
# stdin and emit word/TAG/lemma triples, one sentence per line.
while (<>) {
    chomp;
    s/^\s+//;
    s/\s+$//;
    # Empty line = sentence boundary; pass it through.
    if (/^$/) {
        print "\n";
        next;
    }
    @result = ();
    s/$/ /;   # trailing sentinel space so the token-consuming regex below can anchor
    # Consume one "comment? token/CAT post?" unit from the head of the line per
    # iteration; $5 re-injects the first character of the next unit.
    while (s/^ *((?:\[\|.*?\|\] *)?(?:\( *)?(?:{.*?} *)?)([^{ ][^ ]*?)\/([^\/ \)\|]+)((?: *[\|\)][\|\(\)]*)?) +([^ \|\)]|[\|\)][^ \|\)]|$)/$5/) {
        $comment = $1;
        $token = $2;
        $cat = $3;
        $post = $4;
        $postcat = "";
        # Split off any "-UNK..." marker the tagger appended to the category.
        if ($cat =~ s/(-UNK.*)$//) {
            $postcat = $1;
        }
        $lemma = "";
        if (get_cat_form2lemma($cat,$token) ne "") {
            # Exact match on (category, form).
            push @result, "$comment$token/$cat$postcat/".get_cat_form2lemma($cat,$token);
        } elsif (get_cat_form2lemma($cat,lc($token)) ne "") {
            # Retry with the lowercased form.
            push @result, "$comment$token/$cat$postcat/".get_cat_form2lemma($cat,lc($token));
        } elsif (get_cat_form2lemma($cat,$equiv{$token}) ne "") {
            # Retry after mapping bracket placeholders (--LBR-- etc.) to literals.
            push @result, "$comment$token/$cat$postcat/".get_cat_form2lemma($cat,$equiv{$token});
        } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1)) ne "" && get_cat_form2lemma(PRON,lc($2)) ne "") {
            # Italian: verb + enclitic pronoun glued together (e.g. "vederlo").
            if ($cat ne "PRON") {
                push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1));
            } elsif ($cat eq "PRON") {
                push @result, "$comment$token/$cat$postcat/".get_cat_form2lemma($cat,lc($2));
            }
        } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1."e")) ne "" && get_cat_form2lemma(PRON,lc($2)) ne "") {
            # Italian: same, with the verb's truncated final "e" restored.
            if ($cat ne "PRON") {
                push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1."e"));
            } elsif ($cat eq "PRON") {
                push @result, "$comment$token/$cat$postcat/".get_cat_form2lemma($cat,lc($2));
            }
        } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(.)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1.$2.$2."e")) ne "" && get_cat_form2lemma(PRON,lc($3)) ne "") {
            # Italian: doubled-consonant infinitive + clitic (e.g. "farlo" -> "farre"? — pattern as per model).
            if ($cat ne "PRON") {
                push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1.$2.$2."e"));
            } elsif ($cat eq "PRON") {
                push @result, "$comment$token/$cat$postcat/".get_cat_form2lemma($cat,lc($3));
            }
        } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*)[ai]$/ && $cat =~ /^(NOUN|ADJ|PRON)$/) {
            # Italian: nominal/adjectival inflection — map -a/-i endings to -o.
            if ($lower_case_lemmas) {
                push @result, "$comment$token/$cat$postcat/".lc($1)."o";
            } else {
                push @result, "$comment$token/$cat$postcat/$1o";
            }
        } else {
            # Unknown form: guess the lemma from the longest known form-suffix.
            if ($token !~ /^[A-ZÉ]/) {
                $token_suff = $token;
                $token_pref = "";
                while ($token_suff =~ s/^(.)(?=.)//) {
                    $token_pref .= $1;
                    if (includes_data_for_cat_formsuff($cat,$token_suff)) {
                        if ($multiple_lemmas) {
                            $lemma = get_all_lemmasuffs($cat,$token_suff,$token_pref)
                        } else {
                            $lemma = get_best_lemmasuffs($cat,$token_suff,$token_pref);
                        }
                        last;
                    }
                }
            }
            if ($lemma eq "") {$lemma = $token}
            if ($lower_case_lemmas) {
                push @result, "$comment$token/$cat$postcat/$flag_unknowns".lc($lemma);
            } else {
                push @result, "$comment$token/$cat$postcat/$flag_unknowns".$lemma;
            }
        }
    }
    # Anything the token regex could not consume is a hard error.
    $what_remains = $_;
    $_ = join(" ",@result);
    if ($what_remains =~ /^(\[\|.*?\|\])/) {
        $_ .= $1;
    }
    $what_remains =~ s/^\s*//;
    die $what_remains if ($what_remains ne "");
    # NOTE(review): $post here holds the trailing punctuation of the *last*
    # token only — presumably intentional; confirm against upstream.
    print $_.$post."\n";
}
print STDERR " LEMMATIZER: Lemmatizing: done\n" unless $silent;
# Memoized lookup of the lemma(s) recorded for an exact (category, form)
# pair. Multiple lemmas are joined with "|" in sorted order; the empty
# string means "no lemma known".
sub get_cat_form2lemma {
    my ($cat, $form) = @_;
    return $get_cat_form2lemma_cache{$cat}{$form}
        if defined $get_cat_form2lemma_cache{$cat}{$form};
    $sth_cfl->execute($cat, $form);
    my %seen;
    while (my $row = $sth_cfl->fetchrow) {
        # Values come back as raw bytes from SQLite; decode to characters.
        $seen{Encode::decode("utf8", $row)} = 1;
    }
    $sth_cfl->finish;
    my $joined = join "|", sort { $a cmp $b } keys %seen;
    $get_cat_form2lemma_cache{$cat}{$form} = $joined;
    return $joined;
}
# True iff the suffix database has at least one lemma-suffix entry for the
# given (category, form-suffix) pair. Results are memoized per run.
sub includes_data_for_cat_formsuff {
    my ($cat, $formsuff) = @_;
    if (defined($includes_data_for_cat_formsuff_cache{$cat}{$formsuff})) {
        return $includes_data_for_cat_formsuff_cache{$cat}{$formsuff};
    }
    $sth_cfslsc1->execute($cat,$formsuff);
    my $result = 0;
    while (my $value = $sth_cfslsc1->fetchrow) {
        $result = 1;
        last;
    }
    $sth_cfslsc1->finish;
    # BUGFIX: the cache entry was stored under $form — an undeclared variable —
    # instead of $formsuff, so the memoization never took effect and every call
    # hit the database.
    $includes_data_for_cat_formsuff_cache{$cat}{$formsuff} = $result;
    return $result;
}
# All candidate lemmas for an unknown form: every lemma-suffix recorded for
# (cat, form-suffix), each re-prefixed with the token's consumed prefix and
# joined with "|" in sorted order.
sub get_all_lemmasuffs {
    my $cat = shift;
    my $form = shift;
    my $token_pref = shift;
    # BUGFIX: the cache was keyed on (cat, form) only, but the stored value
    # embeds $token_pref — so whenever the same (cat, form-suffix) recurred
    # with a different prefix, lemmas built from the *previous* token's prefix
    # were returned. The prefix is now part of the cache key.
    if (defined($get_all_lemmasuffs_cache{$cat}{$form}{$token_pref})) {
        return $get_all_lemmasuffs_cache{$cat}{$form}{$token_pref};
    }
    $sth_cfslsc3->execute($cat,$form);
    my %results = ();
    while (my $value = $sth_cfslsc3->fetchrow) {
        $results{$token_pref.Encode::decode("utf8",$value)} = 1;
    }
    $sth_cfslsc3->finish;
    my $result = (join "|", sort {$a cmp $b} keys %results);
    $get_all_lemmasuffs_cache{$cat}{$form}{$token_pref} = $result;
    return $result;
}
# Best single lemma candidate for an unknown form: the first lemma-suffix
# returned by the count-ordered query for (cat, form-suffix), re-prefixed
# with the token's consumed prefix. Returns "" when no row exists.
sub get_best_lemmasuffs {
    my $cat = shift;
    my $form = shift;
    my $token_pref = shift;
    # BUGFIX: same cache-key defect as get_all_lemmasuffs — the cached value
    # embeds $token_pref, so the prefix must be part of the key.
    if (defined($get_best_lemmasuffs_cache{$cat}{$form}{$token_pref})) {
        return $get_best_lemmasuffs_cache{$cat}{$form}{$token_pref};
    }
    $sth_cfslsc2->execute($cat,$form);
    # BUGFIX: $result was left undef when the query returned no row; the undef
    # was then cached, which the defined() guard above treats as a miss,
    # forcing a pointless re-query on every call. Initialize to "".
    my $result = "";
    while (my $value = $sth_cfslsc2->fetchrow) {
        $result = $token_pref.Encode::decode("utf8",$value);
        last;
    }
    $sth_cfslsc2->finish;
    $get_best_lemmasuffs_cache{$cat}{$form}{$token_pref} = $result;
    return $result;
}
#!/usr/bin/perl
# MElt_normalizer.pl — normalizes noisy (user-generated) text before tagging,
# using a per-language lexicon and n-gram replacement patterns.
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
use utf8;
use locale;
$do_not_load_lexicon=0;
# --- Command-line option parsing --------------------------------------------
while (1) {
    $_ = shift;
    if (/^-d$/) {$ngrams_file_dir = shift}
    # BUGFIX: the pattern was /-nc$/ (no '^' anchor), so any option merely
    # *ending* in "-nc" would silently enable no-correction mode.
    elsif (/^-nc$/) {$no_correction = 1}
    elsif (/^-nolex$/) {$do_not_load_lexicon = 1}
    elsif (/^-c$/) {$has_sxpipe_comments = 1}
    elsif (/^-l$/) {$lang = shift || die "Please provide a language code after option -l"}
    elsif (/^$/) {last}
    else {die "Unknown option '$_'"}
}
# With -nc, or the pseudo-language "zzz", pass the input through untouched.
if ($lang eq "zzz" || $no_correction) {
    while (<>) {
        chomp;
        print $_."\n";
    }
    exit 0;
}
# --- Resource loading --------------------------------------------------------
# Normalize the data-directory path, then load the per-language lexicon and
# the n-gram replacement patterns used by the main loop below.
$ngrams_file_dir .= "/" unless $ngrams_file_dir eq "" || $ngrams_file_dir =~ /\/$/;
print STDERR " NORMALIZER: Loading lexical information for language $lang...\n";
if (-d "$ngrams_file_dir") {
    unless ($do_not_load_lexicon) {
        if (-e "${ngrams_file_dir}lex") {
            open FILE, "${ngrams_file_dir}lex";
            binmode FILE, ":utf8";
            while (<FILE>) {
                chomp;
                s/(^|[^\\])#.*//;   # strip unescaped comments
                next if /^\s*$/;
                next if /^_/;
                /^(.*?)\t(.*?)\t(.*)$/ || next;   # form <TAB> category <TAB> morphology
                $form = $1;
                $cat = $2;
                $ms = $3;
                $form =~ s/__.*$//;
                if ($lang eq "fr") {
                    # Index the French lexicon by the properties exploited by the
                    # correction rules in the main loop.
                    $adj_nom_voyelle{$form} = 1 if ($cat =~ /^(adj|nom)/ && $form =~ /^[aeiuoé]/);
                    $verbe_voyelle{$form} = 1 if ($cat eq "v" && $form =~ /^[aeiuoé]/);
                    $inf{$form} = 1 if ($cat eq "v" && $ms eq "W");
                    $verbe_1s{$form} = 1 if ($cat eq "v" && $ms =~ /1/);
                    $lex_final_e{$form} = 1 if $form =~ /e$/;
                    $lex_final_s{$form} = 1 if $form =~ /s$/;
                    $lex_final_t{$form} = 1 if $form =~ /t$/;
                }
                $lex{$form} = 1;
            }
            close FILE;
            if ($lang eq "fr") {
                # Precompute corrections for glued elided determiners/clitics
                # ("lavion" -> "l' avion", "javais" -> "j' avais", ...).
                for (sort {length($b) <=> length($a)} keys %adj_nom_voyelle) {
                    if (!defined($lex{"l".$_})) {
                        $glueddet{"l".$_} = "{l$_◀l'} l' {} $_";
                    }
                    if (!defined($lex{"d".$_})) {
                        $glueddet{"d".$_} = "{d$_◀d'} d' {} $_";
                    }
                }
                for (sort {length($b) <=> length($a)} keys %verbe_voyelle) {
                    # NOTE(review): these two guards test the "l"/"d" forms while
                    # the corrections add "s"/"n" prefixes — this looks like a
                    # copy-paste slip, but is kept as-is to preserve behavior.
                    if (!defined($lex{"l".$_})) {
                        $gluedclit{"s".$_} = "{s$_◀s'} s' {} $_";
                    }
                    if (!defined($lex{"d".$_})) {
                        $gluedclit{"n".$_} = "{n$_◀n'} n' {} $_";
                    }
                }
                for (sort {length($b) <=> length($a)} keys %inf) {
                    if (!defined($lex{"2".$_})) {
                        $glued2{"2".$_} = "{2$_◀2=de} de {} $_";
                    }
                }
                for (sort {length($b) <=> length($a)} keys %verbe_1s) {
                    if (!defined($lex{"j".$_})) {
                        $gluedj{"j".$_} = "{j$_◀j'} j' {} $_";
                    }
                    if (!defined($lex{"J".$_})) {
                        $gluedj{"J".$_} = "{J$_◀J'} J' {} $_";
                    }
                }
            }
        } else {
            print STDERR " NORMALIZER: No normalization lexical information found for language '$lang'. Skipping\n";
        }
    }
    print STDERR " NORMALIZER: Loading lexical information for language $lang: done\n";
    print STDERR " NORMALIZER: Loading replacement patterns (${ngrams_file_dir}ngrams...)\n";
    if (-e "${ngrams_file_dir}ngrams") {
        # BUGFIX: this was `open NGRAMS, "<file" || die $!` — `||` binds tighter
        # than the comma, so it applied to the (always-true) filename string and
        # a failed open was never reported. Low-precedence `or` fixes this.
        open NGRAMS, "<${ngrams_file_dir}ngrams" or die $!;
        binmode NGRAMS, ":utf8";
        while (<NGRAMS>) {
            /^([^_\t][^\t]*)\t([^\t]+)(\t|$)/ || next;   # input-ngram <TAB> output-ngram
            $in = $1;
            $out = $2;
            $newout = "";
            if ($out =~ /\$\d/ || $in =~ /\\/) {
                # Protect spaces inside character classes before token-splitting.
                $in =~ s/(\[\^[^ \]]*) /\1‗/g;
            }
            @in = split / /, $in;
            @out = split / /, $out;
            my $j = 1;
            if ($#in ne $#out) {
                print STDERR " NORMALIZER: Ignoring replacement /$in/$out/ found (different input and output token number)\n";
            } else {
                for $i (0..$#in) {
                    if ($out =~ /\$\d/ || $in =~ /\\/) {
                        while ($in[$i] =~ s/\(.*?\)/\$$j/) {$j++;}
                    }
                    $newout .= "{$in[$i]◀".($#in+1)."} $out[$i] ";
                }
            }
            $newout =~ s/ $//;
            while ($newout =~ s/(}[^{]*) /$1 {} /g){}
            # Patterns with captures/backslashes are stored as compiled regexes;
            # plain ones are quoted literally.
            if ($newout =~ /\$\d/ || $in =~ /\\/) {
                $ngrams{qr/$in/} = $newout;
            } else {
                $ngrams{quotemeta($in)} = $newout;
            }
        }
        close NGRAMS;
    } else {
        print STDERR " NORMALIZER: No replacement patterns found for language '$lang'. Skipping\n";
    }
    print STDERR " NORMALIZER: Loading replacement patterns: done\n";
} else {
    print STDERR " NORMALIZER: No replacement patterns available for language '$lang'. Skipping\n";
}
print STDERR " NORMALIZER: Normalizing...\n";
# Main loop: apply the n-gram replacement patterns and lexicon-driven
# corrections to each line. Every correction is recorded as a
# "{original◀type} corrected" group; the ◀ type markers are stripped at the
# end of the loop unless kept for debugging.
while (<>) {
    chomp;
    $_ = " $_ ";
    s/}\s*_/} _/g;
    # Detect all-caps "shouting" lines: lowercase them now, re-uppercase and
    # mark each token with ◀lc at the end of the loop.
    $is_maj_only = 0;
    $tmp = $_;
    $tmp =~ s/◀.*?}/}/g;
    $tmp =~ s/{([^{}]+)} _[^ ]+/$1/g;
    if ($tmp=~/^[^a-zâäàéèêëïîöôüûùÿ]+$/ && $tmp=~/[A-Z]{5,}/ && length($tmp) > 10) {
        $is_maj_only = 1;
        $_ = lc($_);
        # Special tokens must keep their canonical uppercase spelling.
        s/}\s*_(url|smiley|email|date[^ ]*|time|heure|adresse|underscore|acc_[of])/"} _".uc($1)/ge;
        s/(-[lr][rcs]b-)/uc($1)/ge;
    }
    if ($has_sxpipe_comments) {
        # Collapse identity comments: "{tok} tok" -> "tok".
        s/{([^{}]+)} *\1( |$)/\1\2/g;
    }
    # Apply replacement patterns, most-tokens-first then longest-first.
    for $ngram (sort {(($b=~s/([  ])/\1/g) <=> ($a=~s/([  ])/\1/g)) || (length($b) <=> length($a))} keys %ngrams) {
        $t = $ngrams{$ngram};
        $t =~ s/ / /g;
        $ngram =~ s/ / /g;
        $ngram =~ s/‗/ /g;
        if ($t =~ /\$/) {
            # Pattern with capture back-references: substitute them one match
            # at a time ($1..$9 are filled by the preceding match).
            while (/(?<=[^}]) $ngram /) {
                @v = ();
                $v[1] = $1;
                $v[2] = $2;
                $v[3] = $3;
                $v[4] = $4;
                $v[5] = $5;
                $v[6] = $6;
                $v[7] = $7;
                $v[8] = $8;
                $v[9] = $9;
                $tmp = $t;
                for $i (1..9) {
                    $tmp =~ s/\$$i/$v[$i]/g;
                }
                s/(?<=[^}]) $ngram / $tmp /;
            }
        } else {
            s/(?<=[^}]) $ngram / $t /g;
        }
    }
    # Token-by-token lexicon-driven corrections (French only).
    $tmp = $_;
    $_ = "";
    while ($tmp =~ s/^ *((?:{.*?} )?)(.*?) //) {
        $orig = $1;
        $target = $2;
        $tmptarget = $target;
        if ($lang eq "fr") {
            if ($orig eq "" && length($target) >= 3 && $target !~ /[{}]/ && !defined($lex{$target}) && defined($glueddet{$target})) {
                # Glued elided determiner: "lavion" -> "l' avion".
                $_ .= $glueddet{$target}." ";
            } elsif ($orig eq "" && length($target) >= 3 && $target !~ /[{}]/ &&!defined($lex{$target}) && defined($gluedclit{$target})) {
                $_ .= $gluedclit{$target}." ";
            } elsif ($orig eq "" && length($target) >= 3 && $target !~ /[{}]/ &&!defined($lex{$target}) && defined($glued2{$target})) {
                $_ .= $glued2{$target}." ";
            } elsif ($orig eq "" && length($target) >= 3 && $target !~ /[{}]/ &&!defined($lex{$target}) && defined($gluedj{$target})) {
                $_ .= $gluedj{$target}." ";
            } elsif ($orig eq "" && length($target) >= 2 && $target =~ /^[a-zâäàéèêëïîöôüûùÿ]+$/ && !defined($lex{$target}) && defined($lex_final_s{$target."s"})) {
                # Missing final letter: restore -s / -t / -e when that yields a known form.
                $_ .= "{$target◀s} ${target}s ";
            } elsif ($orig eq "" && length($target) >= 2 && $target =~ /^[a-zâäàéèêëïîöôüûùÿ]+$/ &&!defined($lex{$target}) && defined($lex_final_t{$target."t"})) {
                $_ .= "{$target◀t} ${target}t ";
            } elsif ($orig eq "" && length($target) >= 2 && $target =~ /^[a-zâäàéèêëïîöôüûùÿ]+$/ &&!defined($lex{$target}) && defined($lex_final_e{$target."e"})) {
                $_ .= "{$target◀e} ${target}e ";
            } elsif ($orig eq "" && length($target) >= 2 && $target =~ /^[a-zâäàéèêëïîöôüûùÿ]+$/ &&!defined($lex{$target}) && $tmptarget =~ s/è/é/g && defined($lex{$tmptarget})) {
                # Common accent confusion: è for é.
                $_ .= "{$target◀èé} $tmptarget ";
            } elsif ($orig eq "" && length($target) >= 2 && $target =~ /^[a-zâäàéèêëïîöôüûùÿ]+$/ &&!defined($lex{$target}) && $tmptarget =~ s/é$/ait/g && defined($lex{$tmptarget})) {
                # SMS spelling: final -é for imperfect -ait.
                $_ .= "{$target◀éait} $tmptarget ";
            } elsif ($orig eq "" && length($target) >= 2 && $target !~ /[{}]/ &&!defined($lex{$target}) && ($tmptarget =~ s/(^|[^w])([w\.])\2\2([^w]|$)/\1 \2 \2 \2 \3/g || 1)
                && $tmptarget =~ s/([^0-9\.])(?:\1){2,}/\1/g) {
                # Expressive letter repetition ("suuuuper") collapsed to one.
                $tmptarget =~ s/ ([.]) \1 \1 /\1\1\1/g;
                if ($tmptarget =~ /^(.)(.)/ && $1 eq uc($2)) {
                    $tmptarget =~ s/^(.)./\1/;
                }
                $_ .= "{$target◀etir} $tmptarget ";
            } elsif ($orig eq "" && length($target) >= 2 && $target =~ /^[a-zâäàéèêëïîöôüûùÿ]+$/ &&!defined($lex{$target}) && $tmptarget =~ /^(.*)k$/ && defined($lex{$1.'que'})) {
                $tmptarget =~ s/k$/que/;
                $_ .= "{$target◀kque} $tmptarget "; # we do not even check that this is in the lexicon
            } elsif ($orig eq "" && length($target) >= 2 && $target =~ /^[a-zâäàéèêëïîöôüûùÿ]+$/ &&!defined($lex{$target}) && $target =~ /[aeé]men$/) {
                $_ .= "{$target◀ment} ${target}t "; # we do not even check that this is in the lexicon
            } else {
                $_ .= $orig.$target." ";
            }
        } else {
            $_ .= $orig.$target." ";
        }
    }
    # Restore the original casing for all-caps lines, marking each token.
    if ($is_maj_only) {
        s/{([^}◀]+)/"{".uc($1)/ge;
        s/^ *([^{} ]+)/" {".uc($1)."◀lc} ".$1/ge;
        s/(?<=[^}]) ([^{} ]+)(?= )/" {".uc($1)."◀lc} ".$1/ge;
    }
    s/{([^}◀]+)(?:◀[^}]*)} \1 /\1 /g;
    s/{([LDJSldsj])◀1} [LDJldsj]' +$/\1/;
    s/ +$//;
    s/^ +//;
    s/◀[^}]*}/}/g; # skip this to keep the correction-type markers
    print "$_\n";
}
print STDERR " NORMALIZER: Normalizing: done\n";
#!/usr/bin/perl
# MElt_postprocess.pl — realigns MElt tagger output onto the original tokens
# and applies language-specific tag corrections.
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
use utf8;
$| = 1;   # unbuffered output: this script runs inside a pipeline
# Defaults; overridden by command-line options below.
$remove_non_standard_amalgams = 0;
$tag_amalgam_with_its_last_component_tag = 0;
$keep_token_form_distinction = 0;
$lang = "fr";
# Option parsing; unknown options are silently ignored.
while (1) {
    $_=shift;
    if (/^$/) {last;}
    elsif (/^-l(?:ang(?:age)?)?$/) {$lang=shift || die "Please provide a language code after -l option (en, fr)";}
    elsif (/^-npp$/) {$no_post_process = 1}
    elsif (/^-ktfd$/) {$keep_token_form_distinction = 1}
    elsif (/^-rnsa$/) {$remove_non_standard_amalgams = 1}
    elsif (/^-alct$/) {$tag_amalgam_with_its_last_component_tag = 1}
}
# With -npp or the pseudo-language "zzz": only undo the protective encodings
# (◁/▷, _ACC_O/_ACC_F, _UNDERSCORE) and exit.
if ($lang eq "zzz" || $no_post_process) {
    while (<>) {
        s/^{([^}]+)} _XML\/[^ \n]+$/\1/;
        if (/{/ && $keep_token_form_distinction) {
            s/◁/\\{/g;
            s/▷/\\}/g;
            s/_ACC_O/\\{/g;
            s/_ACC_F/\\}/g;
        } else {
            s/{([^}]*)} *[^ ]+(\/[^ \/]+)/replace_whitespaces_with_underscores($1).$2/ge;
            s/◁/{/g;
            s/▷/}/g;
            s/_ACC_O/{/g;
            s/_ACC_F/}/g;
        }
        s/_UNDERSCORE/_/g;
        print $_;
    }
    exit 0;
}
# Main loop: language-specific tag corrections, then realignment of the
# tagged output onto the original (pre-normalization) tokens.
while (<>) {
    chomp;
    s/^ +//;
    s/ +$//;
    $out = "";
    s/ +/ /g;
    # realignment onto the original tokens (first step)
    s/^\s*{(.*?)} *_XML\/[^ ]+\s*$/${1}/;
    if ($lang eq "en") {
        # English (PTB-style) tag corrections for noisy web/email text.
        s/(^| )vs\.\/[^ ]+/$1vs\.\/IN/g;
        s/(^| )Vince\/[^ ]+/$1Vince\/NNP/g;
        s/(^| )Thanks\/[^ ]+/$1Thanks\/NNS/g;
        s/(^| )please\/[^ ]+/$1please\/UH/g;
        s/(^| )Please\/[^ ]+/$1Please\/UH/g;
        s/(^| )([AP]M)\/[^ ]+/$1$2\/NN/g;
        while (s/{([^{}]+) ([^{} ]+)} ([^ \/{}]+)\/([^ \/]+)/{$1} ${3}\/GW {$2} ${3}\/$4/g) {}
        s/(^| )>\/GW/\1>\/-RRB-/g;
        s/(^| )<\/GW/\1<\/-LRB-/g;
        s/({ *[^{} ]+ *})\s*_SMILEY\/[^ ]+/$1 _SMILEY\/NFP/g;
        s/({ *[^{} ]+ [^{}]+}\s*)_SMILEY\/[^ ]+/$1 _SMILEY\/NFP/g;
        s/_URL\/[^ ]+/_URL\/ADD/g;
        s/_EMAIL\/[^ ]+/_EMAIL\/ADD/g;
        # NOTE(review): the next two rules rewrite _DATE/_TIME tokens to
        # _EMAIL — this looks like a copy-paste slip; confirm against upstream.
        s/_DATE[^ ]*\/[^ ]+/_EMAIL\/CD/g;
        s/_(?:TIME|HEURE)\/[^ ]+/_EMAIL\/CD/g;
        s/(^| )(l+o+l+|a+r+g+h+|a+h+a+|m+d+r+|p+t+d+r+)\/[^ ]+/$1$2\/NFP/gi; #|♥
        s/(^| )([•·\*o])\/[^ ]+/$1$2\/:/g; #?
        s/(^| )([^ {}]+\@[^ {}]{2,})\/[^ \/{}]+/\1\2\/ADD/g; # emails
        s/(^| )([^ {}]+\.{com,org,net,pdf,docx?})\/[^ \/{}]+/\1\2\/ADD/g; # files
        s/(^| )(http[^ {}]+\/[^ {}]+)\/[^ \/{}]+/\1\2\/ADD/g; # URLs
        s/(^| )(www\.[^ {}]+)\/[^ \/{}]+/\1\2\/ADD/g; # URLs
        s/(^| )([^ {}]+([=_\*-\~]{1,2})\3\3\3[^ {}]+)\/[^ \/{}]+/\1\2\/NFP/g;
        s/(^| )(\|)\/[^ \/{}]+/\1\2\/NFP/g;
        s/(^| )(s)\/[^ \/{}]+/\1\2\/AFX/g;
        s/^([A-Z][^ {}]+)\/[^ \/{}]+ ([^ {}]+\/ADD)/\1\/GW \2/g; # !!!
        s/^([A-Z][^ {}]+)\/[^ \/{}]+ ([A-Z])\/[^ \/{}]+ ([^ {}]+\/ADD)/\1\/GW \2\/GW \3/g; # !!!
        s/^-\/[^ {}]+ ([A-Z][^ {}]+)\/[^ \/{}]+ ([^ {}]+\/ADD)/-\/NFP \1\/GW \2/g; # !!!
        s/^-\/[^ {}]+ ([A-Z][^ {}]+)\/[^ \/{}]+ ([A-Z])\/[^ \/{}]+ ([^ {}]+\/ADD)/-\/NFP \1\/GW \2\/GW \3/g; # !!!
    } elsif ($lang eq "fr") {
        # French tag corrections.
        s/( je\/)[^ ]+/\1CLS/g;
        s/^((?:{[^{} ]+} )?)tu\/[^ ]+/\1tu\/CLS/g;
        s/( tu\/)[^ ]+ ((?:{[^{} ]+} )?[^ ]+\/VS?)/\1CLS \2/g;
        s/({ *[^{} ]+ *})\s*_SMILEY\/[^ ]+/$1 _SMILEY\/I/g;
        s/({ *[^{} ]+ [^{}]+})\s*_SMILEY\/[^ ]+/$1 _SMILEY\/X/g;
        s/^([0-9\.]+)\/[^ ]+$/\1\/META/;
        s/^([0-9\.]+)\/[^ ]+ \.\/[^ ]+$/\1\/META \.\/META/;
        s/({\#[^{} ]+}) _URL\/[^ ]+/\1 _URL\/KK/g;
        s/({[^\#][^{} ]*}) _URL\/[^ ]+/\1 _URL\/NPP/g;
        # s/_URL\/[^ ]+/_URL\/NPP/g;
        s/_EMAIL\/[^ ]+/_EMAIL\/NPP/g;
        s/(^| )(l+o+l+|a+r+g+h+|a+h+a+|♥)\/[^ ]+/$1$2\/I/gi;
        s/(^| )([•·\*o]|\.+)\/[^ ]+/$1$2\/PONCT/g;
        s/(^| )(Like|Share)\/[^ ]+/$1$2\/ET/g;
        s/(^|$)([^ ]+)\/[^ ]+ (at)\/[^ ]+ (\d+)\/[^ ]+ (:)\/[^ ]+ (\d+(?:[ap]m)?)\/[^ ]+/$1$2\/ADV $3\/P $4\/DET $5\/PONCT $6\/DET/g;
        s/(^|$)(\d+)\/[^ ]+ (people)\/[^ ]+ (like)\/[^ ]+ (this)\/[^ ]+/$1$2\/DET $3\/NC $4\/V $5\/PRO/g;
        s/(^|$)(\d+)\/[^ ]+ (hours|minutes|seconds)\/[^ ]+ (ago)\/[^ ]+/$1$2\/DET $3\/NC $4\/ADV/g;
        s/(^|$)(love)\/[^ ]+ (u|you)\/[^ ]+/$1$2\/V $3\/PRO/g;
        # for the smsalpes corpus
        s/(^| )\*\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+ ([A-Z]+)\/[^ ]+ (?:{_} _UNDERSCORE|_)\/[^ ]+ ([0-9]+)\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+( |$)/$1***$2_$3***\/NPP$4/g;
        s/(^| )\*\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+ ([A-Z]+)\/[^ ]+ (?:{_} _UNDERSCORE|_)\/[^ ]+ ([0-9]+)\/[^ ]+ (?:{_} _UNDERSCORE|_)\/[^ ]+ ([0-9]+)\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+( |$)/$1***$2_$3_$4***\/NPP$5/g;
        s/(^| )\*\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+ {([A-Z]+)} [^ ]+\/[^ ]+ (?:{_} _UNDERSCORE|_)\/[^ ]+ ([0-9]+)\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+( |$)/$1***$2_$3***\/NPP$4/g;
        s/(^| )\*\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+ {([A-Z]+)} [^ ]+\/[^ ]+ (?:{_} _UNDERSCORE|_)\/[^ ]+ ([0-9]+)\/[^ ]+ (?:{_} _UNDERSCORE|_)\/[^ ]+ ([0-9]+)\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+ \*\/[^ ]+( |$)/$1***$2_$3_$4***\/NPP$5/g;
    }
    s/}_/} _/g;
    $out = "";
    # realignment onto the original tokens
    while ($_ ne "") {
        if (s/^{([^ {}]+)} ([^ {}]+(?: \{\} *[^ {}]+)+)( |$)//) {
            # One original token split into several tagged tokens: join the
            # component tags with "+" (an amalgam tag).
            $t = $1;
            $f = $2;
            $f =~ s/^[^ ]*\///;
            $f =~ s/ {} [^ ]*\//+/g;
            $t =~ s/^(.*)◀.*/\1/;
            if ($f =~ /\+/) {
                if ($remove_non_standard_amalgams && $f ne "P+D" && $f ne "P+PRO") {
                    $f = "X";
                } elsif ($tag_amalgam_with_its_last_component_tag) {
                    $f =~ s/^.*\+//;
                }
            }
            $out .= " $t/$f";
        } elsif (s/^{([^ {}]+(?: [^{}]+)+)} ([^ {}]+)\/([^ {}\/]+)( |$)//) {
            # Several original tokens merged into one tagged token: tag every
            # original token, the last one with the merged tag.
            $t = $1;
            $f = $2;
            $tag = $3;
            $t =~ s/^(.*)◀.*/\1/;
            if ($remove_non_standard_amalgams) {
                $t =~ s/ /\/Y /g;
                $out .= " $t/Y";
            } else {
                if ($lang eq "fr") {
                    $t =~ s/ /\/Y /g;
                } else {
                    $t =~ s/ /\/GW /g;
                }
                $out .= " $t/$tag";
            }
        } elsif (s/^{([^ {}]+)} ([^ {}]+)( |$)//) {
            # One original token, one tagged token: keep the original form.
            $t = $1;
            $f = $2;
            $t =~ s/^(.*)◀.*/\1/;
            $f =~ s/^.*\///;
            $out .= " $t/$f";
        } elsif (s/^([^{} ]+)( |$)//) {
            # Untouched token: copy through.
            $out .= " $1";
        } else {
            die $_;
        }
        s/^ *//;
    }
    # Undo the protective brace encodings and emit the realigned line.
    $out =~ s/◁/{/g;
    $out =~ s/▷/}/g;
    $out =~ s/^ +//;
    $out =~ s/ +$//;
    print $out."\n";
}
# Return a copy of the argument with every ASCII space turned into "_".
sub replace_whitespaces_with_underscores {
    my ($text) = @_;
    $text =~ tr/ /_/;
    return $text;
}
This diff is collapsed.
This diff is collapsed.
xxxxxxxxxxxx.
inf..
prc..
sq..
sqq..
suiv..
sup..
N.B..
d'abord (...ensuite)
d'un côté (...d'un autre côté)
d'une part (...d'autre part)
e.a..
e.g.
et al..
i.e.
ibid..
id..
loc. cit.
op. cit..
q.e.d..
une médaille d'argent aux J.O.
une médaille d'or aux J.O.
une médaille de bronze aux J.O.
c.-à-d.
i.e.
O.K..
B.D
B.D..
C.V..
M.
MM.
Mr.
O.N.U.
P.S..
Q.G..
R.P.
S.A.
S.A..
S.A.R.L.
S.F..
S.O.S..
Tel..
Tél..
adj..
adv..
art.
bibliogr..
boul.
bull.
cap.
ch.
chap.
coll.
collec.
dept.
dir.
dp.
ex..
fasc.
fig.
hab..
ill.
intr..
introd..
ital..
math..
ms.
obs..
p.
p.-s..
paragr.
pl..
pp.
rf..
rd.
s.f
s.f..
sp..
spp..
t.
tel..
trad..
tél..
v.
v.
var.
vol.
zool..
éd.
édit..
étym..
Ch.
George W. Bush
J.-C
J.-C.
J.O.
N.-D.
O.N.U.
St. George's
St. John's
Th.
Th.
U.E.
U.R.S.S.
U.S.A.
B.A.ba
O.P.A.
O.P.A.
Q.I.
k.o.
marques de N(sent.)
rosette d'off. de la L.d'h.
(...)
.
...
...
....
[...]
etc.
apr.
av.
cf.
conf.
vs.
apr.
av.
cf.
conf.
vs.
#_error comme verbe est dans v.ilex
# PLY package
# Author: David Beazley (dave@dabeaz.com)
# Submodules exposed by "from ply import *": the lexer and the parser generator.
__all__ = ['lex','yacc']
This diff is collapsed.
This diff is collapsed.
# ----------------------------------------------------------------------
# ctokens.py
#
# Token specifications for symbols in ANSI C and C++. This file is
# meant to be used as a library in other tokenizers.
# ----------------------------------------------------------------------
# Reserved words
# Declared token names; PLY requires every t_NAME rule below to correspond
# to an entry in this list.
tokens = [
    # Literals (identifier, integer constant, float constant, string constant, char const)
    'ID', 'TYPEID', 'ICONST', 'FCONST', 'SCONST', 'CCONST',

    # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=)
    'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
    'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
    'LOR', 'LAND', 'LNOT',
    'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

    # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=)
    'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL',
    'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL',

    # Increment/decrement (++,--)
    'PLUSPLUS', 'MINUSMINUS',

    # Structure dereference (->)
    'ARROW',

    # Ternary operator (?)
    'TERNARY',

    # Delimeters ( ) [ ] { } , . ; :
    'LPAREN', 'RPAREN',
    'LBRACKET', 'RBRACKET',
    'LBRACE', 'RBRACE',
    'COMMA', 'PERIOD', 'SEMI', 'COLON',

    # Ellipsis (...)
    'ELLIPSIS',
]
# Simple token rules. In PLY, each module-level t_NAME regex defines the
# token NAME, which must appear in the tokens list above — rules for
# undeclared names are rejected when the lexer is built.

# Operators
t_PLUS          = r'\+'
t_MINUS         = r'-'
t_TIMES         = r'\*'
t_DIVIDE        = r'/'
# BUGFIX: was t_MODULO, but the declared token name is 'MOD'.
t_MOD           = r'%'
t_OR            = r'\|'
t_AND           = r'&'
t_NOT           = r'~'
t_XOR           = r'\^'
t_LSHIFT        = r'<<'
t_RSHIFT        = r'>>'
t_LOR           = r'\|\|'
t_LAND          = r'&&'
t_LNOT          = r'!'
t_LT            = r'<'
t_GT            = r'>'
t_LE            = r'<='
t_GE            = r'>='
t_EQ            = r'=='
t_NE            = r'!='

# Assignment operators
t_EQUALS        = r'='
t_TIMESEQUAL    = r'\*='
t_DIVEQUAL      = r'/='
t_MODEQUAL      = r'%='
t_PLUSEQUAL     = r'\+='
t_MINUSEQUAL    = r'-='
t_LSHIFTEQUAL   = r'<<='
t_RSHIFTEQUAL   = r'>>='
t_ANDEQUAL      = r'&='
t_OREQUAL       = r'\|='
# BUGFIX: '^' was unescaped (r'^='), i.e. a start-of-input anchor matching
# '=' rather than the literal '^=' operator.
t_XOREQUAL      = r'\^='

# Increment/decrement
# BUGFIX: were t_INCREMENT/t_DECREMENT; the declared names are
# 'PLUSPLUS'/'MINUSMINUS'.
t_PLUSPLUS      = r'\+\+'
t_MINUSMINUS    = r'--'

# Structure dereference (->)
t_ARROW         = r'->'

# Ternary operator (?)
t_TERNARY       = r'\?'

# Delimeters
t_LPAREN        = r'\('
t_RPAREN        = r'\)'
t_LBRACKET      = r'\['
t_RBRACKET      = r'\]'
t_LBRACE        = r'\{'
t_RBRACE        = r'\}'
t_COMMA         = r','
t_PERIOD        = r'\.'
t_SEMI          = r';'
t_COLON         = r':'
t_ELLIPSIS      = r'\.\.\.'

# Identifiers
t_ID            = r'[A-Za-z_][A-Za-z0-9_]*'

# Literals
# BUGFIX: were t_INTEGER/t_FLOAT/t_STRING/t_CHARACTER; the declared token
# names are ICONST/FCONST/SCONST/CCONST.
# Integer literal
t_ICONST        = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'
# Floating literal (NOTE(review): pattern contains literal spaces around '|',
# inherited from the original — verify against upstream before tightening)
t_FCONST        = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
# String literal
t_SCONST        = r'\"([^\\\n]|(\\.))*?\"'
# Character constant 'c' or L'c'
t_CCONST        = r'(L)?\'([^\\\n]|(\\.))*?\''
# Comment (C-Style)
def t_COMMENT(t):
    r'/\*(.|\n)*?\*/'
    # The docstring above *is* the token's regex (PLY convention) — do not edit.
    # Keep the lexer's line counter in sync across multi-line comments so that
    # subsequent error messages report correct line numbers.
    t.lexer.lineno += t.value.count('\n')
    return t

# Comment (C++-Style)
def t_CPPCOMMENT(t):
    r'//.*\n'
    # A C++ comment always consumes exactly one newline.
    t.lexer.lineno += 1
    return t
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment