importateur europresse (with previous ids) ok, isi:ok, bud date for ris

b67643b0 · Administrator · 9968bfff · b67643b0 · b67643b0 · b67643b0
Commit b67643b0 authored Oct 02, 2014 by Administrator
8 changed files
--- a/.ipynb_checkpoints/gargantext_web_tutorial-checkpoint.ipynb
+++ b/.ipynb_checkpoints/gargantext_web_tutorial-checkpoint.ipynb
 {
 "metadata": {
  "name": "",
-  "signature": "sha256:2afae28d08bbb0945aaca44a5b704550048c5dc193cc3d81cb11a551fcc03864"
+  "signature": "sha256:9a933c547abe9d85a6fbe1af7d7d7371d37042e6ad63ac70b7e46acedf7f294f"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
@@ -92,6 +92,38 @@
     "outputs": [],
     "prompt_number": 6
    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import zipfile"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "with zipfile.ZipFile(\"/tmp/date.zip\", 'r') as f:\n",
+      "    for x in f.namelist():\n",
+      "        print(x)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "date.txt\n"
+       ]
+      }
+     ],
+     "prompt_number": 12
+    },
    {
     "cell_type": "markdown",
     "metadata": {},
@@ -696,32 +728,150 @@
     "outputs": [],
     "prompt_number": 12
    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "# REDIS"
+     ]
+    },
    {
     "cell_type": "code",
     "collapsed": false,
-     "input": [],
+     "input": [
+      "BROKER_URL = 'redis://localhost:6379/0'\n",
+      "# redis://:password@hostname:port/db_number"
+     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
-     "prompt_number": 12
+     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
-     "input": [],
+     "input": [
+      "BROKER_TRANSPORT_OPTIONS = {'visibility_timeout': 3600}  # 1 hour."
+     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
-     "prompt_number": 12
+     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
-     "input": [],
+     "input": [
+      "CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'"
+     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
-     "prompt_number": 91
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "BROKER_TRANSPORT_OPTIONS = {'fanout_prefix': True}"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 4
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "BROKER_TRANSPORT_OPTIONS = {'visibility_timeout': 43200}"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from __future__ import absolute_import\n",
+      "\n",
+      "from celery import Celery\n",
+      "\n",
+      "app = Celery('proj',\n",
+      "             broker='redis://localhost:6379/0',\n",
+      "             backend='redis://localhost:6379/0',\n",
+      "             include=['proj.tasks'])\n",
+      "\n",
+      "# Optional configuration, see the application user guide.\n",
+      "app.conf.update(\n",
+      "    CELERY_TASK_RESULT_EXPIRES=3600,\n",
+      ")\n",
+      "\n",
+      "if __name__ == '__main__':\n",
+      "    pass#app.start()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 9
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from __future__ import absolute_import\n",
+      "\n",
+      "\n",
+      "@app.task\n",
+      "def add(x, y):\n",
+      "    return x + y\n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 10
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "app.send_task(add(3, 1000))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 15,
+       "text": [
+        "<AsyncResult: c4807752-eb28-4e0f-b8b9-fce8267bddd3>"
+       ]
+      }
+     ],
+     "prompt_number": 15
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "celery -A proj worker --loglevel=info"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "ename": "SyntaxError",
+       "evalue": "invalid syntax (<ipython-input-16-5806eb0c4fe2>, line 1)",
+       "output_type": "pyerr",
+       "traceback": [
+        "\u001b[1;36m  File \u001b[1;32m\"<ipython-input-16-5806eb0c4fe2>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m    celery -A proj worker --loglevel=info\u001b[0m\n\u001b[1;37m                 ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
+       ]
+      }
+     ],
+     "prompt_number": 16
    },
    {
     "cell_type": "code",

--- a/dependances.deb
+++ b/dependances.deb
--- a/gargantext_web_tutorial.ipynb
+++ b/gargantext_web_tutorial.ipynb
 {
 "metadata": {
  "name": "",
-  "signature": "sha256:c112732dc666c365db7529d3971cad98ecc2bd84cdea3dc8613b609abf19e262"
+  "signature": "sha256:fd8cdda63e0e9cba7dbdfac864550d69bace0f3f834ebd216402a50bc10992b1"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
@@ -148,6 +148,61 @@
     ],
     "prompt_number": 3
    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from lxml import etree"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "help(etree.parse)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "Help on built-in function parse in module lxml.etree:\n",
+        "\n",
+        "parse(...)\n",
+        "    parse(source, parser=None, base_url=None)\n",
+        "    \n",
+        "    Return an ElementTree object loaded with source elements.  If no parser\n",
+        "    is provided as second argument, the default parser is used.\n",
+        "    \n",
+        "    The ``source`` can be any of the following:\n",
+        "    \n",
+        "    - a file name/path\n",
+        "    - a file object\n",
+        "    - a file-like object\n",
+        "    - a URL using the HTTP or FTP protocol\n",
+        "    \n",
+        "    To parse from a string, use the ``fromstring()`` function instead.\n",
+        "    \n",
+        "    Note that it is generally faster to parse from a file path or URL\n",
+        "    than from an open file object or file-like object.  Transparent\n",
+        "    decompression from gzip compressed sources is supported (unless\n",
+        "    explicitly disabled in libxml2).\n",
+        "    \n",
+        "    The ``base_url`` keyword allows setting a URL for the document\n",
+        "    when parsing from a file-like object.  This is needed when looking\n",
+        "    up external entities (DTD, XInclude, ...) with relative paths.\n",
+        "\n"
+       ]
+      }
+     ],
+     "prompt_number": 6
+    },
    {
     "cell_type": "markdown",
     "metadata": {},

--- a/sources/europresse.py
+++ b/sources/europresse.py
@@ -31,7 +31,7 @@ from lxml import etree
 from documents.models import Document
 #from .corpus import Corpus

-class Europresse(Document):
+class Europresse():
    """
    1) First build tree to parse data
    2) Then each notice (article) is nested in a dictionary,
@@ -45,7 +45,6 @@ class Europresse(Document):

        # Specific declarations for Europresse
        self.data       = []
-        self.object_ids = []

        # Encoding
        self.codif      = "UTF-8"
@@ -177,11 +176,16 @@ class Europresse(Document):
                        'authors': "", 'section': "", 'page':"", 'text': "", 'object_id':""}
                count += 1

-    def add(self, project=None, corpus=None, user=None):
+    def add(self, project=None, corpus=None, user=None, ids=None):
        """ Appends notices to self.corpus from self.data removing duplicates"""
+        if ids is not None:
+            self.object_ids = ids
+        else:
+            self.object_ids = set()
+        
        for i in self.data:
            if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
-                self.object_ids.append(i['uniqu_id'])
+                self.object_ids.add(i['uniqu_id'])
                doc = Document()
                
                doc.project = project
@@ -190,7 +194,6 @@ class Europresse(Document):
                doc.date    = i['date']
                doc.uniqu_id= i['uniqu_id']
                doc.title   = i['title']
-                print(doc.project)

                doc.source  = i['source']
                doc.authors = i['authors']
@@ -210,7 +213,6 @@ def demo():
    except Exception as e:
        print("very usefull function", e)
    
-    for a in data.corpus:
        print(a['date'])



--- a/sources/importateur.py
+++ b/sources/importateur.py

 # import Celery here

+from documents.models import Document
 from sources.europresse import Europresse
 from sources.isi import Isi
 from sources.pubmed import Pubmed
@@ -9,58 +10,65 @@ import zipfile

 def importer(source, language, zip_file, project=None, corpus=None, user=None):
    
+    ids = set([ doc.uniqu_id for doc in Document.objects.filter(corpus=corpus)])
+    
    if source.database == "Europresse":
        try:
+            print("Europresse DB detected")
            c = Europresse()
            if zipfile.is_zipfile(zip_file):
                with zipfile.ZipFile(zip_file, 'r') as z:
                    for fichiers in z.namelist():
                        fichier = z.open(fichiers, 'r')
                        c.parse(fichier)
-                        c.add(project=project, corpus=corpus, user=user)
+                        c.add(project=project, corpus=corpus, user=user, ids=ids)

        except Exception as e:
            print(e)
    
-    elif source.database == "Isi":
+    elif source.database == "Web of Science (ISI format)":
        try:
+            print("ISI DB detected")
            c = Isi()
            if zipfile.is_zipfile(zip_file):
                with zipfile.ZipFile(zip_file, 'r') as z:
                    for fichiers in z.namelist():
+                        print("parsing %s" % (fichiers))
                        fichier = z.open(fichiers, 'r')
                        c.parse(fichier, bdd='isi')
-                        c.add(project=project, corpus=corpus, user=user)
+                        c.add(project=project, corpus=corpus, user=user, ids=ids)

        except Exception as e:
            print(e)
    
-    elif source.database == "Ris":
+    elif source.database == "RIS (Zotero)":
        try:
+            print("RIS DB detected")
            c = Isi()
            if zipfile.is_zipfile(zip_file):
                with zipfile.ZipFile(zip_file, 'r') as z:
                    for fichiers in z.namelist():
                        fichier = z.open(fichiers, 'r')
-                        c.parse(fichier)
-                        c.ajouter(project=project, corpus=corpus, user=user)
+                        c.parse(fichier, bdd='ris')
+                        c.add(project=project, corpus=corpus, user=user, ids=ids)

        except Exception as e:
            print(e)

    elif source.database == "Pubmed":
        try:
+            print("PubMed DB detected")
            c = Pubmed()
            if zipfile.is_zipfile(zip_file):
                with zipfile.ZipFile(zip_file, 'r') as z:
                    for fichiers in z.namelist():
                        fichier = z.open(fichiers, 'r')
                        c.parse(fichier)
-                        c.ajouter(project=project, corpus=corpus, user=user)
+                        c.ajouter(project=project, corpus=corpus, user=user, ids=ids)

        except Exception as e:
            print(e)
    else:
-        pass
+        print("Corpus not detected")


--- a/sources/isi.py
+++ b/sources/isi.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+""" 
+ISI parser.
+__author__ : alexandre+gargantext @ delanoe.org
+__licence__ : GPL version 3.0+
+__DATE__ : 2014
+__VERSION__ : 1.0
+"""
+
+
+
 import os, sys
 #reload(sys)
 import re
 import locale
+# import hashlib ?
+
 from datetime import datetime, date
 from dateutil import parser

@@ -22,7 +36,6 @@ class Isi() :
        """
        # Specific declarations for ISI
        self.data       = []
-        self.object_ids = []

    def read_param(self,file) :
        """
@@ -35,7 +48,7 @@ class Isi() :
        for line in lines:
            if line[0] != '#':
                tag = line.split('\t')
-                tags[tag[1]] = [tag[0], tag[2]]
+                tags[str(tag[1])] = [str(tag[0]), str(tag[2])]
        return tags

    def rules(self, parameters) :
@@ -51,12 +64,17 @@ class Isi() :
        """
        #source = open(file, 'r')
        lines = source.readlines()
-        document = {}
+        doc = {}
        if bdd == 'isi':
-            parameters = self.read_param('sources/parameters/isi.init')
+            try:
+                print("reading parameters ISI")
+                parameters = self.read_param('sources/parameters/isi.init')
+            except Exception as e: print(e)
        elif bdd == 'ris':
-            parameters = self.read_param('sources/parameters/ris.init')
-
+            try:
+                print("reading parameters RIS")
+                parameters = self.read_param('sources/parameters/ris.init')
+            except Exception as e: print(e)

        for key in list(parameters.keys()):
            if parameters[key][0] == 'BEGIN' :
@@ -68,8 +86,14 @@ class Isi() :
                del parameters[end]
        
        for line in lines :
-            if document == {} and line[:2] == begin :
-                document['url'] = " "
+            line = str(line, encoding='UTF-8')
+            
+            if bdd == 'ris':
+                line = line.replace(' - ', '')
+
+            if doc == {} and line[:2] == begin :
+                #print(line)
+                doc['url'] = " "
                key             = ""
                result          = ""

@@ -77,9 +101,9 @@ class Isi() :
                
                if key != "" and key != line[:2]:
                    try:
-                        document[parameters[key][0]] = result
+                        doc[parameters[key][0]] = result
                    except Exception as e: print(e)
-                    #document.setdefault(parameters[key][0],[]).append(result)
+                    #doc.setdefault(parameters[key][0],[]).append(result)
                
                key = line[:2]
                result = line[2:].strip()
@@ -89,49 +113,85 @@ class Isi() :
                    result = result + ' ' + line[2:].strip()#.split(";")
                    
                except Exception as error :
-                    pass
+                    print(error)
            
            elif line[:2] == end :
-                document[parameters[key][0]] = result
-                
+                doc[parameters[key][0]] = result
                try:
                    try: 
-                        date = document['year'] + " " + document['month']
-                        document['date'] = parser.parse(date)
+                        date = doc['year'] + " " + doc['month']
+                        doc['date'] = parser.parse(date)
                    except:
-                        date = document['year']
-                        document['date'] = datetime.strptime(date, '%Y')
+                        date = doc['year']
+                        doc['date'] = datetime.strptime(date, '%Y')

-                except Exception as e: print('88', e)
-                self.data.append(document)
-                document = {}
+                except Exception as e: 
+                    print('88', e)
+                    try:
+                        print(doc['year'])
+                    except Exception as e: print('58',e)
+                self.data.append(doc)
+                doc = {}

-    def add(self, project=None, corpus=None, user=None):
+    def add(self, project=None, corpus=None, user=None, ids=None):
        """ Appends notices to self.corpus from self.data removing duplicates"""
+        if ids is not None:
+            self.object_ids = ids
+        else:
+            self.object_ids = set()
+
        for i in self.data:
+            if 'uniqu_id' not in i.keys():
+                #crypt = md5.new()
+                #crypt.update(i['title'])
+                #i['uniqu_id'] = crypt.digest()
+                i['uniqu_id'] = i['title'] + i['date']
+
            if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
-                self.object_ids.append(i['uniqu_id'])
+                self.object_ids.add(i['uniqu_id'])
                doc = Document()
                
-                doc.project = project
-                doc.user    = user
+                try: 
+                    doc.project = project
+                except Exception as e: print(e)
+                
+                try:
+                    doc.user    = user
+                except Exception as e: print(e)

-                doc.date    = i['date']
-                doc.uniqu_id= i['uniqu_id']
-                doc.title   = i['title']
-                print(doc.project)
+                try:
+                    doc.date    = i['date']
+                except Exception as e: print(e)
+                    
+                try:
+                    doc.uniqu_id= i['uniqu_id']
+                except Exception as e: print(e)
+                    
+                try:
+                    doc.title   = i['title']
+                except Exception as e: print(e)

-                doc.source  = i['source']
-                doc.authors = i['authors']
-                doc.text    = i['text']
+                try:
+                    doc.source  = i['source']
+                except Exception as e: print(e)
+                    
+                try:
+                    doc.authors = i['authors']
+                except Exception as e: print(e)
+                    
+                try:
+                    doc.abstract    = i['abstract']
+                except Exception as e: print(e)

-                doc.save()
+                try:
+                    doc.save()
+                except Exception as e: print(e)
+                
                doc.corpus.add(corpus)

        self.data = []


-
 def demo():
    import sys
    data = Isi()

--- a/sources/parameters/isi.init
+++ b/sources/parameters/isi.init
@@ -15,7 +15,7 @@ language	LA	""
 DT	DT	""
 keywords	DE	;
 ID	ID	;
-text	AB	
+abstract	AB	
 ISIC1	C1	\n
 reprint_author	RP	,
 email	EM	\n
@@ -41,5 +41,5 @@ page	PG	""
 field	WC	""
 SC	SC	""
 GA	GA	""
-object_id	UT	""
+uniqu_id	UT	""
 END	ER	""
--- a/sources/parameters/ris.init
+++ b/sources/parameters/ris.init
+##############################################################################
+# LEGEND:
+# NAME (what you want[1])	FIELD (see your data)	SEPARATORS (see your data)
+# 
+# [1] 
+# Be careful with these variable names, which must not be changed:
+# BEGIN, uniqu_id, END
+##############################################################################
+BEGIN	TY	""
+authors	AU	\n
+AF	AF	"\n"
+title	TI	""
+source	SO	"\n"
+language	LA	""
+DT	DT	""
+keywords	KW	;
+ID	ID	;
+abstract	AB	
+text	ST	,
+ISIC1	C1	\n
+reprint_author	RP	,
+email	EM	\n
+thanks	FX	
+CR	CR	\n
+number	NR	\n
+TC	TC	""
+Z9	Z9	""
+PU	PU	""
+PI	PI	""
+PA	PA	""
+SN	SN	""
+journal_small	J9	""
+JI	JI	""
+month	PD	""
+year	PY	""
+volume	VL	""
+IS	IS	""
+BP	BP	""
+EP	EP	""
+DOI	DI	""
+page	PG	""
+field	WC	""
+SC	SC	""
+GA	GA	""
+uniqu_id	DO	""
+END	ER	""