fix quotes simplification in normalize_chars and space management in normalize_terms

e6f742b0 · Romain Loth · 3b46af20 · e6f742b0 · e6f742b0
Commit e6f742b0 authored Jun 29, 2016 by Romain Loth
Show whitespace changes
Inline Side-by-side

Showing with 11 additions and 5 deletions

ngrams_extraction.py gargantext/util/toolchain/ngrams_extraction.py +5 -2

parsing.py gargantext/util/toolchain/parsing.py +6 -3

No files found.
--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -127,12 +127,15 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
    (benefits from normalize_chars upstream so there are fewer cases to consider)
    """
-    term_str = sub(r'^[-",;/%(){}\\\[\]\.\']+', '', term_str)
+    # print('normalize_terms  IN: "%s"' % term_str)
-    term_str = sub(r'[-",;/%(){}\\\[\]\.\']+$', '', term_str)
+    term_str = sub(r'^[-",;/%(){}\\\[\]\.\' ]+', '', term_str)
+    term_str = sub(r'[-",;/%(){}\\\[\]\.\' ]+$', '', term_str)
    if do_lowercase:
        term_str = term_str.lower()
+    # print('normalize_terms OUT: "%s"' % term_str)
    return term_str

--- a/gargantext/util/toolchain/parsing.py
+++ b/gargantext/util/toolchain/parsing.py
@@ -86,6 +86,7 @@ def normalize_chars(my_str):
       ou passer en lowercase, seront à placer plutôt *après* le tagger,
            cf. toolchain.ngrams_extraction.normalize_terms)
    """
+    # print('normalize_chars  IN: "%s"' % my_str)
    # --------------
    # E S P A C E S
    # --------------
@@ -121,12 +122,12 @@ def normalize_chars(my_str):
    # Guillemets
    # ----------
    # la plupart des quotes simples --> ' APOSTROPHE
-    my_str = sub(r"‘’‚`‛", "'", my_str) # U+2018 U+2019 U+201a U+201b
+    my_str = sub(r"[‘’‚`‛]", "'", my_str) # U+2018 U+2019 U+201A U+0060 U+201B
    my_str = sub(r'‹ ?',"'", my_str)    # U+2039 plus espace éventuel après
    my_str = sub(r' ?›',"'", my_str)    # U+203A plus espace éventuel avant
    # la plupart des quotes doubles --> " QUOTATION MARK
-    my_str = sub(r'“”„‟', '"', my_str)  # U+201C U+201D U+201E U+201F
+    my_str = sub(r'[“”„‟]', '"', my_str)  # U+201C U+201D U+201E U+201F
    my_str = sub(r'« ?', '"', my_str)   # U+00AB plus espace éventuel après
    my_str = sub(r' ?»', '"', my_str)   # U+00BB plus espace éventuel avant
@@ -178,4 +179,6 @@ def normalize_chars(my_str):
    my_str = sub(r'Ꜩ', 'Tz', my_str)
    my_str = sub(r'ꜩ', 'tz', my_str)
+    # print('normalize_chars OUT: "%s"' % my_str)
    return my_str