Commit b5ff7a7f authored by Romain Loth

[feat] add some more or less uninformative expressions from academic genres to...

[feat] add some more or less uninformative expressions from academic genres to the base stoplist (bug-17)
parent 5fc55d30
......@@ -27,7 +27,6 @@ def is_stop_word(ngram, stop_words=None):
# , "(.*)(\.)(.*)" trop fort (enlève les sigles !)
, "(.*)(\,)(.*)"
, "(.*)(< ?/?p ?>)(.*)" # marques de paragraphes
, "(.*)(study|elsevier)(.*)"
, "(.*)\b(xx|xi|xv)\b(.*)"
, "(.*)(result)(.*)"
, "(.*)(year|année|nombre|moitié)(.*)"
......@@ -37,6 +36,87 @@ def is_stop_word(ngram, stop_words=None):
, "(.*)(terme)(.*)"
, "(.*)(différent)(.*)"
, "(.*)(travers)(.*)"
# academic stamps
, ".*\belsevier\b.*"
, ".*\bwiley\b.*)"
, ".*\bspringer\b.*"
, ".*university press\b.*"
, ".*\bstudy\b.*"
# academic terms when alone ~~> usually not informative
, "hypothes[ie]s$"
, "analys[ie]s$"
, "bas[ie]s$"
, "online$"
, "importance$"
, "uses?$"
, "cases?$"
, "effects?$"
, "times?$"
, "methods?$"
, "types?$"
, "evidences?$"
, "findings?$"
, "relations?$"
, "terms?$"
, "procedures?$"
, "factors?$"
, "reports?$"
, "changes?$"
, "facts?$"
, "others?$"
, "applications?$"
, "periods?$"
, "investigations?$"
, "orders?$"
, "forms?$"
, "conditions?$"
, "situations?$"
, "papers?$"
, "relationships?$"
, "values?$"
, "areas?$"
, "techniques?$"
, "means?$"
, "conclusions?$"
, "comparisons?$"
, "parts?$"
, "amounts?$"
, "aims?$"
, "lacks?$"
, "issues?$"
, "ways?$"
, "ranges?$"
, "models?$"
, "articles?$"
, "series?$"
, "totals?$"
, "influences?$"
, "journals?$"
, "rules?$"
, "persons?$"
, "abstracts?$"
, "(?:book)? reviews?$"
, "process(?:es)?$"
, "approach(?:es)?$"
, "theor(?:y|ies)?$"
, "methodolog(?:y|ies)?$"
, "similarit(?:y|ies)?$"
, "possibilit(?:y|ies)?$"
, "stud(?:y|ies)?$"
# non-thematic or non-NP expressions
, "none$"
, "other(?: hand)?$"
, "whereas$"
, "usually$"
, "and$"
# , "vol$"
, "eds?$"
, "ltd$"
, "copyright$"
, "e-?mails?$"
, ".*="
, "=.*"
, "further(?:more)?$"
, "(.*)(:|\|)(.*)"
] :
compiled_regexes.append(compile(regex))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment