Commit 235afd9c authored by c24b

Hacking languages info + adding corpus.skipped_docs in parsing

parent ee16eff6
@@ -8,44 +8,55 @@ from re import sub
 def parse(corpus):
     try:
         documents_count = 0
         corpus.status('Docs', progress=0)
         # will gather info about languages
         observed_languages = defaultdict(int)
-        # retrieve resource information
-        for resource in corpus.resources():
-            # information about the resource
-            if resource['extracted']:
-                continue
-            # source stores the available module for a resource
-            source = get_resource(resource["type"])
-            resource_parser = load_parser(source)
-            resource_path = resource['path']
-            # extract and insert documents from corpus resource into database
-            for hyperdata in resource_parser(resource_path):
-                # normalize the text values for easier POS tagging and processing
-                for k in ['abstract', 'title']:
-                    if k in hyperdata:
-                        try:
-                            hyperdata[k] = normalize_chars(hyperdata[k])
-                        except Exception as error:
-                            print("Error normalize_chars", error)
+        # get the source capabilities for a given corpus
+        resources = [r for r in corpus.resources() if not r['extracted']]
+        sources = [get_resource(r["type"]) for r in resources]
+        if len(sources) == 0:
+            # >>> documents have already been parsed?
+            return
+        if len(sources) > 0:
+            # >>> necessarily 1 corpus = 1 source in the current architecture
+            source = sources[0]
+            resource = resources[0]
+        if source["parser"] is None:
+            # corpus.status(error)
+            raise ValueError("Resource '%s' has no Parser" % resource["name"])
+        else:
+            # pre-seed the language census with the source's default languages
+            corpus.languages = dict.fromkeys(source["default_languages"], 0)
+            # will collect the ids of documents that could not be parsed cleanly
+            corpus.skipped_docs = []
+            # load the corresponding parser
+            resource_parser = load_parser(source)
+            skipped_languages = []
+            # extract and insert documents from resource['path'] into database
+            for hyperdata in resource_parser(resource["path"]):
+                # indexed text fields defined in constants
+                for k in DEFAULT_INDEX_FIELDS:
+                    if k in hyperdata:
+                        try:
+                            hyperdata[k] = normalize_chars(hyperdata[k])
+                        except Exception as error:
+                            hyperdata["error"] = "Error normalize_chars"
+                # a simple census to raise language info at corpus level
+                if "language_iso2" in hyperdata:
+                    try:
+                        corpus.languages[hyperdata["language_iso2"]] += 1
+                    except KeyError:
+                        # language not in the source's defaults: flag and skip
+                        hyperdata["error"] = "Error: unsupported language"
+                        skipped_languages.append(hyperdata["language_iso2"])
                 # save as DB child
                 # ----------------
                 document = corpus.add_child(
                     typename = 'DOCUMENT',
                     name = hyperdata.get('title', '')[:255],
                     hyperdata = hyperdata,
                 )
                 session.add(document)
+                if "error" in document.hyperdata:
+                    # document.status("error")
+                    corpus.skipped_docs.append(document.id)
                 # a simple census to raise language info at corpus level
                 if "language_iso2" in hyperdata:
                     observed_languages[hyperdata["language_iso2"]] += 1
                 # logging
                 if documents_count % BATCH_PARSING_SIZE == 0:
@@ -58,7 +69,7 @@ def parse(corpus):
         # add a corpus-level info about languages...
         corpus.hyperdata['languages'] = observed_languages
         # ...with a special key inside for skipped languages at ngrams_extraction
-        corpus.hyperdata['languages']['__skipped__'] = {}
+        corpus.hyperdata['languages']['__skipped__'] = Counter(skipped_languages)
         # commit all changes
         corpus.status('Docs', progress=documents_count, complete=True)
         corpus.save_hyperdata()
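
For reference, the language-census pattern introduced by this commit can be exercised in isolation. The sketch below uses hypothetical data (the default_languages values and the fake document list are illustrative, not from the codebase): pre-seeding the census with dict.fromkeys means any language outside the source's default list raises KeyError and is tallied separately, which is what feeds the '__skipped__' Counter in the second hunk.

    from collections import Counter

    # hypothetical inputs, for illustration only
    default_languages = ['en', 'fr']
    docs = [{'language_iso2': 'en'}, {'language_iso2': 'fr'},
            {'language_iso2': 'zz'}, {'language_iso2': 'en'}]

    # pre-seed the census: only whitelisted languages exist as keys
    languages = dict.fromkeys(default_languages, 0)
    skipped_languages = []

    for hyperdata in docs:
        try:
            languages[hyperdata['language_iso2']] += 1
        except KeyError:
            # unsupported language: remember it for the corpus-level report
            skipped_languages.append(hyperdata['language_iso2'])

    languages['__skipped__'] = Counter(skipped_languages)
    print(languages)  # {'en': 2, 'fr': 1, '__skipped__': Counter({'zz': 1})}

Note that Counter(skipped_languages) assumes collections.Counter is imported elsewhere in the module; the relevant import line is outside the hunks shown.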