fix case when languages are recognized through iso3 but have no code in iso2

7002c9e3 · Romain Loth · 912d76f9 · 7002c9e3 · 7002c9e3 · 7002c9e3
Commit 7002c9e3 authored May 17, 2016 by Romain Loth
Showing with 24 additions and 3 deletions

_Parser.py gargantext/util/parsers/_Parser.py +19 -2

__init__.py gargantext/util/toolchain/__init__.py +4 -0

ngrams_extraction.py gargantext/util/toolchain/ngrams_extraction.py +1 -1

No files found.
--- a/gargantext/util/parsers/_Parser.py
+++ b/gargantext/util/parsers/_Parser.py
@@ -126,10 +126,27 @@ class Parser:
                        break
                except KeyError:
                    language_keyerrors[key] = language_symbol
+        # `languages` resolves Language objects from either an iso2 or an iso3 code
+        # --------------------------------------------------------------
+        # > languages['fr']
+        # <Language iso3="fra" iso2="fr" implemented="True" name="French">
+        # > languages['fra']
+        # <Language iso3="fra" iso2="fr" implemented="True" name="French">
        if language is not None:
-            hyperdata['language_iso2'] = language.iso2
-            hyperdata['language_iso3'] = language.iso3
            hyperdata['language_name'] = language.name
+            hyperdata['language_iso3'] = language.iso3
+            if (language.iso2 is not None):
+                # NB: a language can be recognized through iso3 but have no iso2,
+                #     because there are *more* languages in iso3 codes (iso-639-3)
+                # example:
+                # > languages['dnj']
+                # <Language iso3="dnj" iso2="None" implemented="False" name="Dan">
+                #                            ----
+                hyperdata['language_iso2'] = language.iso2
+            else:
+                # None would become JSON null  ==> "__unknown__" is more stable
+                hyperdata['language_iso2'] = "__unknown__"
        elif language_keyerrors:
            print('Unrecognized language: %s' % ', '.join(
                '%s="%s"' % (key, value) for key, value in language_keyerrors.items()

--- a/gargantext/util/toolchain/__init__.py
+++ b/gargantext/util/toolchain/__init__.py
@@ -54,6 +54,10 @@ def parse_extract_indexhyperdata(corpus):
    corpus.status('Workflow', progress=1)
    corpus.save_hyperdata()
    session.commit()
+    # FIXME: 'Workflow' will still be incomplete when 'Index' and 'Lists'
+    #        get stacked into hyperdata['statuses'], but calling corpus.status()
+    #        returns only the 1st incomplete action (corpus.status() doesn't
+    #        understand "subactions")
    # apply actions
    print('CORPUS #%d' % (corpus.id))

--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -56,7 +56,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', )):
                ngramsextractor = ngramsextractors[language_iso2]
            except KeyError:
                # skip document
-                print('Unsupported language: `%s`' % (language_iso2, ))
+                print('Unsupported language: `%s` (doc #%i)' % (language_iso2, document.id))
                # and remember that for later processes (eg stemming)
                document.hyperdata['__skipped__'] = 'ngrams_extraction'
                document.save_hyperdata()