Commit 6c4a607c authored by Mathieu Rodic's avatar Mathieu Rodic

Merge branch 'mat' of ssh://delanoe.org:1979/gargantext

parents 75a84f95 2ca5116a
{
"metadata": {
"name": "",
"signature": "sha256:7c80ed9f4b088e13444efb451a1ee46e5727247be14aaf30ddf0236a49ac461b"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": []
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:a5146fbde2b6bf2e3ed4e2bdddfb62662f99272f26e82bf86110680ff3595332"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from node.models import Node, NodeType, Language\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"node = Node.objects.get(name=\"PubMed corpus\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/var/www/gargantext/media/' + node.fichier.name)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(node)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "Cannot assign \"24\": \"Node.user\" must be a \"User\" instance.",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-4-8c1443001599>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mfileparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnode\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/srv/gargantext/parsing/FileParsers/PubmedFileParser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, parentNode, tag)\u001b[0m\n\u001b[0;32m 45\u001b[0m \u001b[0mlanguage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_languages_iso3\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"language_iso3\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[0mmetadata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 47\u001b[1;33m \u001b[0mguid\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"doi\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 48\u001b[0m )\n\u001b[0;32m 49\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdocument\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/srv/gargantext/parsing/FileParsers/FileParser.py\u001b[0m in \u001b[0;36mcreate_document\u001b[1;34m(self, parentNode, title, contents, language, metadata, guid)\u001b[0m\n\u001b[0;32m 100\u001b[0m \u001b[0mmetadata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[1;31m#resource = resource,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 102\u001b[1;33m \u001b[0mparent\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparentNode\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 103\u001b[0m )\n\u001b[0;32m 104\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/models/base.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 403\u001b[0m \u001b[1;31m# \"user_id\") so that the object gets properly cached (and type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 404\u001b[0m \u001b[1;31m# checked) by the RelatedObjectDescriptor.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 405\u001b[1;33m \u001b[0msetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfield\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrel_obj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 406\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 407\u001b[0m \u001b[0msetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfield\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mattname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/models/fields/related.py\u001b[0m in \u001b[0;36m__set__\u001b[1;34m(self, instance, value)\u001b[0m\n\u001b[0;32m 337\u001b[0m raise ValueError('Cannot assign \"%r\": \"%s.%s\" must be a \"%s\" instance.' %\n\u001b[0;32m 338\u001b[0m (value, instance._meta.object_name,\n\u001b[1;32m--> 339\u001b[1;33m self.field.name, self.field.rel.to._meta.object_name))\n\u001b[0m\u001b[0;32m 340\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mvalue\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0minstance\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_state\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdb\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: Cannot assign \"24\": \"Node.user\" must be a \"User\" instance."
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"node.children.all()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
This diff is collapsed.
......@@ -23,9 +23,9 @@ PROJECT_PATH = os.path.abspath(PROJECT_PATH)
SECRET_KEY = 'bt)3n9v&a02cu7^^=+u_t2tmn8ex5fvx8$x4r*j*pb1yawd+rz'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = False
DEBUG = True
TEMPLATE_DEBUG = False
TEMPLATE_DEBUG = True
TEMPLATE_DIRS = (
......
Install the requirements
------------------------
1) Install all the Debian packages listed in dependances.deb
(also: sudo apt-get install postgresql-contrib)
2) Create a virtual environment with pyvenv: apt-get install python-virtualenv
3) Type: source [your virtual environment directory]/bin/activate
4) Do your work!
5) Type: deactivate
Configure stuff
---------------
1) ln -s [the project folder] /srv/gargantext
2) ln -s [your folder for tree tagger] [the project folder]/parsing/Tagger/treetagger
Warning: for ln, path has to be absolute!
In PostgreSQL
-------------
1) Ensure postgres is started: sudo /etc/init.d/postgresql start
2) sudo su postgres
3) psql
4) CREATE USER alexandre WITH PASSWORD 'C8kdcUrAQy66U';
(see gargantext_web/settings.py, DATABASES = { ... })
5) CREATE DATABASE gargandb WITH OWNER alexandre;
6) Ctrl + D
7) psql gargandb
8) CREATE EXTENSION hstore;
9) Ctrl + D
Populate the database
---------------------
python manage.py syncdb
Start the Python Notebook server
--------------------------------
1) In Pyvenv: python manage.py shell_plus --notebook
2) Work from your browser!
Start the Django server
-----------------------
python manage.py runserver
\ No newline at end of file
......@@ -2,7 +2,6 @@
psql -d gargandb -f init.sql
sleep 2
./manage.py syncdb
......
import collections
from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.NgramsExtractors import *
# This allows the fast retrieval of ngram ids
# from the cache instead of using the database for every call
class NgramCache:
"""
This allows the fast retrieval of ngram ids
from the cache instead of using the database for every call
"""
def __init__(self, language):
self._cache = dict()
......@@ -13,9 +16,9 @@ class NgramCache:
terms = terms.strip().lower()
if terms not in self._cache:
try:
ngram = NGram.get(terms=terms, language=self._language)
ngram = Ngram.get(terms=terms, language=self._language)
except:
ngram = NGram(terms=terms, n=len(terms), language=self._language)
ngram = Ngram(terms=terms, n=len(terms), language=self._language)
ngram.save()
self._cache[terms] = ngram
return self._cache[terms]
......@@ -43,12 +46,11 @@ class FileParser:
self._ngramcaches = NgramCaches()
# extractors
self._extractors = dict()
self._document_nodetype = NodeType.get(name='Document')
with Language.objects.all() as languages:
self._languages_iso2 = {language.iso2.lower(): language for language in Language}
self._languages_iso3 = {language.iso3.lower(): language for language in Language}
# ...and parse!
self.parse()
self._document_nodetype = NodeType.objects.get(name='Document')
languages = Language.objects.all()
self._languages_iso2 = {language.iso2.lower(): language for language in languages}
self._languages_iso3 = {language.iso3.lower(): language for language in languages}
#self.parse()
"""Extract the ngrams from a given text.
"""
......@@ -65,45 +67,54 @@ class FileParser:
extractor = self._extractors[language]
# Extract the ngrams
if extractor:
tokens = []
for ngram in extractor.extract_ngrams(text):
ngram_text = ' '.join([token for token, tag in ngram])
tokens.append(ngram_text)
return collections.Counter(
[token for token, tag in extractor.extract_ngrams(text)]
# [token for token, tag in extractor.extract_ngrams(text)]
tokens
)
else:
return dict()
#TODO
# * make it possible to tag and parse separately
# * only tags some data (only titles, titles & abstracts, some chapters...)
"""Add a document to the database.
"""
def create_document(self, parentNode, title, contents, language, metadata, guid=None):
# create or retrieve a resource for that document, based on its user id
if guid is None:
resource = Resource(guid=guid)
else:
try:
resource = Resource.get(guid=guid)
except:
resource = Resource(guid=guid)
# If the parent node already has a child with this resource, pass
# (is it a good thing?)
if parentNode.descendants().filter(resource=resource).exists():
return None
# if guid is None:
# resource = Resource(guid=guid)
# else:
# try:
# resource = Resource.get(guid=guid)
# except:
# resource = Resource(guid=guid)
# # If the parent node already has a child with this resource, pass
# # (is it a good thing?)
# if parentNode.descendants().filter(resource=resource).exists():
# return None
# create the document itself
childNode = Node(
user = parentNode.pk,
user = parentNode.user,
type = self._document_nodetype,
name = title,
language = language,
metadata = metadata,
resource = resource,
#resource = resource,
parent = parentNode
)
childNode.save()
# parse it!
ngrams = self.extract_ngrams(contents, language)
# we are already in a transaction, so no use doing another one (or is there?)
ngramcache = self._ngramcaches[language]
for terms, occurences in ngrams.items():
ngram_text = ' '.join([term[0] for term in terms])
ngram = ngramcache[ngram_text]
ngram = ngramcache[terms]
Node_Ngram(
node = childNode,
ngram = ngram,
......@@ -111,7 +122,7 @@ class FileParser:
).save()
# return the created document
return document
return childNode
"""Useful method to detect the document encoding.
Not sure it should be here actually.
......
from django.db import transaction
from lxml import etree
from parsing.FileParsers.FileParser import FileParser
from parsing.NgramsExtractors import *
import zipfile
import datetime
class PubmedFileParser(FileParser):
def parse(self, parentNode):
def parse(self, parentNode, tag=True):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = etree.parse(self._file, parser=xml_parser)
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle')
documents = []
with transaction.atomic():
for xml_article in xml_articles:
# extract data from the document
date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = {
# other metadata should also be included:
# authors, submission date, etc.
"date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text,
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text,
"language_iso3": xml_article.find('MedlineCitation/Article/Language').text,
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
}
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database
yield self.create_document(
parentNode = parentNode,
document = self.create_document(
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = self._languages_iso3[metadata["language"].lower()],
metadata = metadata,
guid = metadata["doi"],
)
if document:
documents.append(document)
with zipfile.ZipFile(self._file) as zipFile:
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
# print(file.read())
xml = etree.parse(file, parser=xml_parser)
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle')
for xml_article in xml_articles:
# extract data from the document
date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = {
"date_pub": '%s-%s-%s' % (date_year, date_month, date_day),
}
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText'
}
for key, path in metadata_path.items():
try:
node = xml_article.find(path)
metadata[key] = node.text
except:
metadata[key] = ""
contents = metadata["abstract"]
# create the document in the database
document = self.create_document(
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = self._languages_iso3[metadata["language_iso3"].lower()],
metadata = metadata,
#guid = metadata["doi"],
)
if document:
documents.append(document)
return documents
from NgramsExtractors.NgramsExtractor import NgramsExtractor
from Taggers import NltkTagger
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
from parsing.Taggers import NltkTagger
class EnglishNgramsExtractor(NgramsExtractor):
......
from NgramsExtractors.NgramsExtractor import NgramsExtractor
from Taggers import TreeTagger
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
from parsing.Taggers import TreeTagger
class FrenchNgramsExtractor(NgramsExtractor):
......
from Taggers import Tagger
from parsing.Taggers import Tagger
import nltk
......@@ -17,9 +17,8 @@ class NgramsExtractor:
def __del__(self):
self.stop()
def start(self):
self.tagger = Tagger
self.tagger = Tagger()
def stop(self):
pass
......@@ -40,7 +39,7 @@ class NgramsExtractor:
except:
print("Problem while parsing rule '%s'" % (self._rule, ))
pass
return iter(result)
return result
\ No newline at end of file
from NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
\ No newline at end of file
#from NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
#from NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from Taggers.Tagger import Tagger
from parsing.Taggers.Tagger import Tagger
import nltk
......
from Taggers.Tagger import Tagger
from parsing.Taggers.Tagger import Tagger
import subprocess
import threading
......
from Taggers.NltkTagger import NltkTagger
from Taggers.TreeTagger import TreeTagger
from parsing.Taggers.NltkTagger import NltkTagger
from parsing.Taggers.TreeTagger import TreeTagger
......@@ -2,21 +2,21 @@ from NgramsExtractors import *
from Taggers import *
#texts = [
# "This is quite a simple test.",
# "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe.",
# "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour.",
#]
#tagger = NltkTagger()
#extractor = EnglishNgramsExtractor()
#
texts = [
"This is quite a simple test.",
"Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe.",
"James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour.",
"La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini.",
"Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie.",
"Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone.",
]
tagger = NltkTagger()
extractor = EnglishNgramsExtractor()
# texts = [
# "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini.",
# "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie.",
# "Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone.",
# ]
# tagger = TreeTagger()
# extractor = FrenchNgramsExtractor()
tagger = TreeTagger()
extractor = FrenchNgramsExtractor()
for text in texts:
......@@ -25,4 +25,4 @@ for text in texts:
ngrams = extractor.extract_ngrams(text)
for ngram in ngrams:
print("\t" + str(ngram))
print("\n")
\ No newline at end of file
print("\n")
......@@ -6,4 +6,4 @@ node = Node.objects.get(name="PubMed corpus")
parser = parsing.Parser()
parser.parse_node_fichier(node)
#parser.parse_node_fichier(node)
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment