Commit e6f742b0 authored by Romain Loth

fix quotes simplification in normalize_chars and space management in normalize_terms

parent 3b46af20
...@@ -127,12 +127,15 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG): ...@@ -127,12 +127,15 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
(benefits from normalize_chars upstream so there's less cases to consider) (benefits from normalize_chars upstream so there's less cases to consider)
""" """
term_str = sub(r'^[-",;/%(){}\\\[\]\.\']+', '', term_str) # print('normalize_terms IN: "%s"' % term_str)
term_str = sub(r'[-",;/%(){}\\\[\]\.\']+$', '', term_str) term_str = sub(r'^[-",;/%(){}\\\[\]\.\' ]+', '', term_str)
term_str = sub(r'[-",;/%(){}\\\[\]\.\' ]+$', '', term_str)
if do_lowercase: if do_lowercase:
term_str = term_str.lower() term_str = term_str.lower()
# print('normalize_terms OUT: "%s"' % term_str)
return term_str return term_str
......
...@@ -86,6 +86,7 @@ def normalize_chars(my_str): ...@@ -86,6 +86,7 @@ def normalize_chars(my_str):
ou passer en lowercase, seront à placer plutôt *après* le tagger, ou passer en lowercase, seront à placer plutôt *après* le tagger,
cf. toolchain.ngrams_extraction.normalize_terms) cf. toolchain.ngrams_extraction.normalize_terms)
""" """
# print('normalize_chars IN: "%s"' % my_str)
# -------------- # --------------
# E S P A C E S # E S P A C E S
# -------------- # --------------
...@@ -121,12 +122,12 @@ def normalize_chars(my_str): ...@@ -121,12 +122,12 @@ def normalize_chars(my_str):
# Guillemets # Guillemets
# ---------- # ----------
# la plupart des quotes simples --> ' APOSTROPHE # la plupart des quotes simples --> ' APOSTROPHE
my_str = sub(r"‘’‚`‛", "'", my_str) # U+2018 U+2019 U+201a U+201b my_str = sub(r"[‘’‚`‛]", "'", my_str) # U+2018 U+2019 U+201a U+201b
my_str = sub(r'‹ ?',"'", my_str) # U+2039 plus espace éventuel après my_str = sub(r'‹ ?',"'", my_str) # U+2039 plus espace éventuel après
my_str = sub(r' ?›',"'", my_str) # U+203A plus espace éventuel avant my_str = sub(r' ?›',"'", my_str) # U+203A plus espace éventuel avant
# la plupart des quotes doubles --> " QUOTATION MARK # la plupart des quotes doubles --> " QUOTATION MARK
my_str = sub(r'“”„‟', '"', my_str) # U+201C U+201D U+201E U+201F my_str = sub(r'[“”„‟]', '"', my_str) # U+201C U+201D U+201E U+201F
my_str = sub(r'« ?', '"', my_str) # U+20AB plus espace éventuel après my_str = sub(r'« ?', '"', my_str) # U+20AB plus espace éventuel après
my_str = sub(r' ?»', '"', my_str) # U+20AB plus espace éventuel avant my_str = sub(r' ?»', '"', my_str) # U+20AB plus espace éventuel avant
...@@ -178,4 +179,6 @@ def normalize_chars(my_str): ...@@ -178,4 +179,6 @@ def normalize_chars(my_str):
my_str = sub(r'Ꜩ', 'Tz', my_str) my_str = sub(r'Ꜩ', 'Tz', my_str)
my_str = sub(r'ꜩ', 'tz', my_str) my_str = sub(r'ꜩ', 'tz', my_str)
# print('normalize_chars OUT: "%s"' % my_str)
return my_str return my_str
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment