Commit 1b04dbcf authored by Mathieu Rodic

[FEAT] taggers are now working

parent b6c68976
from ._Tagger import Tagger
from .lib.melttagger.tagger import POSTagger, Token, DAGParser, DAGReader
import subprocess
import sys
import os
# references for tag equivalents:
# - http://cs.nyu.edu/grishman/jet/guide/PennPOS.html
# - http://www.lattice.cnrs.fr/sites/itellier/SEM.html
class identity_dict(dict):
def __missing__(self, key):
return key
_tag_replacements = dict()
_tag_replacements['fr'] = identity_dict({
'DET': 'DT',
'NC': 'NN',
'NPP': 'NNP',
'ADJ': 'JJ',
'PONCT': '.',
'ADVWH': 'WRB',
'ADV': 'RB',
'DETWH': 'WDT',
'PROWH': 'WP',
'ET': 'FW',
'VINF': 'VB',
'I': 'UH',
'CS': 'IN',
# 'CLS': '',
# 'CLR': '',
# 'CLO': '',
# 'PRO': '',
# 'PROREL': '',
# 'P': '',
# 'P+D': '',
# 'P+PRO': '',
# 'V': '',
# 'VPR': '',
# 'VPP': '',
# 'VS': '',
# 'VIMP': '',
# 'PREF': '',
# 'ADJWH': '',
})
_tag_replacements['en'] = identity_dict()
class MeltTagger(Tagger):
def __init__(self, *args, **kwargs):
self.language = kwargs.pop('language', 'fr')
self._tag_replacements = _tag_replacements[self.language]
super().__init__(*args, **kwargs)
def start(self, melt_data_path='lib/melttagger'):
language = self.language
basepath = os.path.dirname(os.path.realpath(__file__))
path = os.path.join(basepath, melt_data_path)
self._pos_tagger = POSTagger()
self._pos_tagger.load_tag_dictionary('%s/%s/tag_dict.json' % (path, language))
self._pos_tagger.load_lexicon('%s/%s/lexicon.json' % (path, language))
self._pos_tagger.load_model('%s/%s' % (path, language))
self._preprocessing_commands = (
('%s/MElt_normalizer.pl' % path, '-nc', '-c', '-d', '%s/%s' % (path, language), '-l', language, ),
('%s/segmenteur.pl' % path, '-a', '-ca', '-af=%s/pctabr' % path, '-p', 'r'),
)
self._lemmatization_commands = (
('%s/MElt_postprocess.pl' % path, '-npp', '-l', language),
('%s/MElt_lemmatizer.pl' % path, '-m', '%s/%s' % (path, language)),
)
def stop(self):
pass
def _pipe(self, text, commands, encoding='utf8'):
text = text.encode(encoding)
for command in commands:
process = subprocess.Popen(
command,
bufsize=0,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
text, err = process.communicate(text)
if len(err):
print(err.decode(encoding), file=sys.stderr)
return text.decode(encoding)
def _tag(self, text):
preprocessed = self._pipe(text, self._preprocessing_commands)
for sentence in preprocessed.split('\n'):
words = sentence.split(' ')
tokens = [Token(word) for word in words]
tagged_tokens = self._pos_tagger.tag_token_sequence(tokens)
for token in tagged_tokens:
if len(token.string):
yield (token.string, token.label, )
def tag_text(self, text, lemmatize=False):
tagged_tokens = self._tag(text)
# without lemmatization
if not lemmatize:
for form, tag in tagged_tokens:
yield (form, self._tag_replacements[tag])
return
# with lemmatization
command_input = ' '.join(
'%s/%s' % (token, tag)
for token, tag in tagged_tokens
)
lemmatized = self._pipe(command_input, self._lemmatization_commands)
for token in lemmatized.split():
if len(token):
values = token.split('/')
yield (values[0], self._tag_replacements[values[1]], values[2].replace('*', ''))
def EnglishMeltTagger(*args, **kwargs):
kwargs['language'] = 'en'
return MeltTagger(*args, **kwargs)
def FrenchMeltTagger(*args, **kwargs):
kwargs['language'] = 'fr'
return MeltTagger(*args, **kwargs)
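# A minimal usage sketch (an illustration, not part of the original file; it
# assumes the MElt models and helper scripts are installed under
# lib/melttagger/<language>/ as expected by start() above):
#
#     tagger = FrenchMeltTagger()
#     for form, tag in tagger.tag_text('Le chat dort .'):
#         print(form, tag)
#
# With lemmatize=True, tag_text() yields (form, tag, lemma) triples instead.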
from ._Tagger import Tagger
import nltk
class NltkTagger(Tagger):
def tag_tokens(self, tokens, single=True):
return nltk.pos_tag(tokens)
from ._Tagger import Tagger
import subprocess
import threading
import time
import os
# TODO: have a look at "queue" instead of "list" (cf. http://stackoverflow.com/questions/17564804/in-python-how-to-wait-until-only-the-first-thread-is-finished)
class identity_dict(dict):
def __missing__(self, key):
return key
_tag_replacements = identity_dict({
"NOM": "NN",
"NAM": "NN",
"ADJ": "NN",
"VER": "JJ",
"PREP": "PRP",
"KON": "CC",
"DET": "DT",
"PRO": "DT",
# Do we also have to take semicolons, commas and other punctuation marks into account?
})
def _readOutput(output, buffer):
hasStarted = False
while True:
line = output.readline()
if line:
if line == b"<block>\n":
hasStarted = True
continue
if line == b"<block/>\n":
break
if hasStarted:
token, tag = line.decode('utf8').split()[:2]
tag = _tag_replacements[tag.split(':')[0]]
buffer.append((token, tag))
else:
time.sleep(0.1)
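# A sketch of the alternative mentioned in the TODO above (hypothetical, not
# wired into the classes below): with a queue.Queue, the reader thread can
# push (token, tag) pairs and the consumer can block on get() instead of
# polling a shared list.
#
#     import queue
#     buffer = queue.Queue()
#     # in the reader thread:  buffer.put((token, tag))
#     # in the consumer:       token, tag = buffer.get(timeout=1.0)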
"""Use TreeTagger for the tagging.
Shall be used for French texts.
"""
class TreeTagger(Tagger):
def start(self, treeTaggerPath='./lib/treetagger'):
print(treeTaggerPath)
if treeTaggerPath[0] == '.':
treeTaggerPath = '%s/%s' % (os.path.dirname(os.path.realpath(__file__)), treeTaggerPath, )
print(treeTaggerPath)
binaryFile = "%s/bin/tree-tagger" % (treeTaggerPath, )
print(binaryFile)
tagcmdlist = [
binaryFile,
"%s/lib/french-utf8.par" % treeTaggerPath,
"-token",
"-lemma",
"-sgml",
"-quiet"
]
self._popen = subprocess.Popen(
tagcmdlist, # Use a list of params in place of a string.
bufsize=0, # Not buffered to retrieve data asap from TreeTagger
executable=binaryFile, # As we have it, specify it
stdin=subprocess.PIPE, # Get a pipe to write input data to TreeTagger process
stdout=subprocess.PIPE, # Get a pipe to read processing results from TreeTagger
stderr=subprocess.PIPE, # Get a pipe to read error messages from TreeTagger
)
self._input, self._output = self._popen.stdin, self._popen.stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
def stop(self):
# terminates the 'treetagger' process
try:
self._popen.kill()
self._popen.terminate()
except:
pass
def tagging_start(self):
self.buffer = []
self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
self._thread.start()
self._input.write(b"<block>\n")
def tagging_end(self):
self._input.write(b"<block/>\n")
# sends some dummy tokens, then waits for the text to be processed
self.tag_tokens("Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split(), False)
self._thread.join()
def tag_tokens(self, tokens, single=True):
if single:
self.tagging_start()
for token in tokens:
self._input.write(bytes(token + "\n", "utf8"))
if single:
self.tagging_end()
return self.buffer
def tag_text(self, text):
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
self.tag_tokens(tokens, False)
self.tagging_end()
return self.buffer
# from ._Tagger import Tagger
from .lib.nlpserver.client import NLPClient
class TurboTagger:
def start(self):
self._nlpclient = NLPClient()
def stop(self):
if hasattr(self, '_nlpclient'):
del self._nlpclient
def tag_text(self, text):
if not hasattr(self, '_nlpclient'):
self._nlpclient = NLPClient()
tokens_tags = []
for sentence in self._nlpclient.tag(text):
for token, tag in sentence:
tokens_tags.append((token, tag, ))
return tokens_tags
"""This base class is a model for performing tagging in a pipeline fashion.
When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged words are returned as tuples.
"""
import re
class Tagger:
def __init__(self):
# Regular expression used to split a line of text into tokens (words, numbers, punctuation).
self._re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?!():_`-] # these are separate tokens ('-' placed last so it matches literally)
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
self.start()
def __del__(self):
self.stop()
def start(self):
"""Initializes the tagger.
This method is called by the constructor, and can be overridden by
inherited classes.
"""
def stop(self):
"""Ends the tagger.
This method is called by the destructor, and can be overridden by
inherited classes.
"""
def tagging_start(self):
"""This method is userful in the case of pipelines requiring
boundaries around blocks of text.
"""
def tagging_end(self):
pass
def tag_tokens(self, tokens, single=True):
"""Returns the tagged tokens.
This method shall be overridden by inherited classes.
Example of input: ['This', 'is', 'not', 'a', 'sentence', '.']
Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
"""
if single:
self.tagging_start()
# do something with the tokens here
if single:
self.tagging_end()
return []
# Not used right now
def tag_text(self, text):
"""Send a text to be tagged.
"""
tokens_tags = []
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
tokens_tags += self.tag_tokens(tokens, False)
self.tagging_end()
return tokens_tags
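# A minimal sketch of a concrete subclass, to illustrate the contract described
# in the module docstring above (hypothetical example, not part of the package):
#
#     class ConstantTagger(Tagger):
#         def tag_tokens(self, tokens, single=True):
#             if single:
#                 self.tagging_start()
#             tagged = [(token, 'NN') for token in tokens]
#             if single:
#                 self.tagging_end()
#             return tagged
#
#     ConstantTagger().tag_text('This is not a sentence .')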
from .TurboTagger import TurboTagger
from .NltkTagger import NltkTagger
from .TreeTagger import TreeTagger
from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
/srv/gargantext_lib/taggers/melttagger
\ No newline at end of file
GETTING STARTED
===============
* Download the following files (if all you need is tagging, the second
archive is not necessary):
- http://www.ark.cs.cmu.edu/TurboParser/sample_models/english_proj_tagger.tar.gz
- http://www.ark.cs.cmu.edu/TurboParser/sample_models/english_proj_parser.tar.gz
* Expand them, and place the extracted files in the `data` directory
CONFIGURATION
=============
The settings for the server can be found in `settings.py`.
Please ensure the TCP port is not already in use on your machine, and that the paths to the models are correct.
START for tests
===============
python3 server.py
"CTRL + c" to shut down
START/STOP THE SERVER
=====================
Simply run the following command to start: `./nlpserver start`
To stop: `./nlpserver stop`
If starting the server failed, have a look at the log in `nlpserver.log`.
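QUERYING THE SERVER
===================
Once the server is running, it can be queried from Python with the bundled
client (a minimal sketch, assuming the default host and port from `settings.py`):

    from .client import NLPClient  # imported from within the nlpserver package

    client = NLPClient()
    for sentence in client.tag('The cat sleeps.'):
        for token, tag in sentence:
            print(token, tag)

Each call yields one list of rows per sentence; with `asdict=True`, rows are
returned as dictionaries instead of lists.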
import socket
import sys
import re
from .settings import server_type_client, server_host, server_port, server_buffer
from .settings import implemented_methods
class NLPClient:
def __init__(self):
self._socket = None
for method_name in dir(self):
if method_name[0] != '_':
if method_name.upper() not in implemented_methods:
setattr(self, method_name, self._notimplemented)
def __del__(self):
self._disconnect()
def _connect(self):
self._disconnect()
self._socket = socket.socket(*server_type_client)
self._socket.connect((server_host, server_port))
def _disconnect(self):
if self._socket is not None:
self._socket.close()
self._socket = None
def _notimplemented(self, *args, **kwargs):
raise NotImplementedError(
'Only the following methods are allowed: {}'.format(
', '.join(implemented_methods)
)
)
def _getline(self):
"""Get one line of text from the buffer
"""
buf = self._socket.recv(server_buffer).decode()
done = False
while not done:
if '\n' in buf:
line, buf = buf.split('\n', 1)
yield line
else:
more = self._socket.recv(server_buffer).decode()
if not more:
done = True
else:
buf += more
if buf:
yield buf
def _request(self, action, text, language, keys=None):
"""Generic method to request info from the server
"""
data = action + ' '
data += language + '\n'
data += re.sub(r'\n+', '\n', text)
data += '\n\n'
self._connect()
self._socket.sendall(data.encode())
sentence = []
if keys is None:
for line in self._getline():
if not line:
if not sentence:
break
yield sentence
sentence = []
continue
sentence.append(line.split('\t'))
else:
for line in self._getline():
if not line:
if not sentence:
break
yield sentence
sentence = []
continue
values = line.split('\t')
sentence.append(dict(zip(keys, values)))
def tokenize(self, text, language='english', asdict=False):
keys = ('token', ) if asdict else None
return self._request('TOKENIZE', text, language, keys)
def tag(self, text, language='english', asdict=False):
keys = ('token', 'tag', ) if asdict else None
return self._request('TAG', text, language, keys)
def lemmatize(self, text, language='english', asdict=False):
keys = ('token', 'tag', 'lemma') if asdict else None
return self._request('LEMMATIZE', text, language, keys)
def parse(self, text, language='english', asdict=False):
keys = ('token', 'tag', 'lemma', 'head', 'deprel', ) if asdict else None
return self._request('PARSE', text, language, keys)
# Benchmark when the script is called directly
if __name__ == '__main__':
from time import time
text = """Current therapeutics for schizophrenia, the typical and atypical antipsychotic class of drugs, derive their therapeutic benefit predominantly by antagonism of the dopamine D2 receptor subtype and have robust clinical benefit on positive symptoms of the disease with limited to no impact on negative symptoms and cognitive impairment. Driven by these therapeutic limitations of current treatments and the recognition that transmitter systems beyond the dopaminergic system in particular glutamatergic transmission contribute to the etiology of schizophrenia significant recent efforts have focused on the discovery and development of novel treatments for schizophrenia with mechanisms of action that are distinct from current drugs. Specifically, compounds selectively targeting the metabotropic glutamate receptor 2/3 subtype, phosphodiesterase subtype 10, glycine transporter subtype 1 and the alpha7 nicotinic acetylcholine receptor have been the subject of intense drug discovery and development efforts. Here we review recent clinical experience with the most advanced drug candidates targeting each of these novel mechanisms and discuss whether these new agents are living up to expectations."""
# text = open('/home/mat/projects/parser/animal-farm.txt').read()  # optionally benchmark on a larger local corpus
client = NLPClient()
iterations = int(sys.argv[1]) if len(sys.argv) > 1 else 1
for asdict in (False, True):
print()
print('Retrieving results as ' + (
'dict' if asdict else 'list'
) + 's')
print('---------------------------')
for method_name in dir(client):
if method_name[0] != '_':
method = getattr(client, method_name)
print('%-16s' % method_name, end='')
t0 = time()
n = 0.0
for i in range(0, iterations):
try:
for sentence in method(text, asdict=asdict):
n += 1.0
t = time() - t0
print('%8.2f s %8.2f ms per sentence' % (t, 1000*t/n if n else 0.0))
except NotImplementedError:
print('(not implemented)')
print()
# lemmatize 2.89 s 1.76 ms per sentence
# parse 25.21 s 15.37 ms per sentence
# tag 2.90 s 1.77 ms per sentence
# tokenize 0.19 s 0.12 ms per sentence
/srv/gargantext_lib/taggers/nlpserver/data
\ No newline at end of file
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
lemmatizer = WordNetLemmatizer()
_lemmatize = lemmatizer.lemmatize
tags_translate = defaultdict(str)
tags_translate.update({
'J': 'a',
'N': 'n',
'V': 'v',
})
def lemmatize(token, tag):
tag_type = tags_translate[tag[0]]
return _lemmatize(token, tag_type) if tag_type else token
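# Example behaviour (based on WordNetLemmatizer and the mapping above):
#   lemmatize('geese', 'NNS')   -> 'goose'    # 'N' -> noun
#   lemmatize('running', 'VBG') -> 'run'      # 'V' -> verb
#   lemmatize('quickly', 'RB')  -> 'quickly'  # no mapping for 'R': returned unchanged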
#!/bin/bash
# In case this bash file is placed in another directory (e.g., /etc/init.d),
# the following line should be changed to an absolute path
DAEMON_DIR=$( cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
DAEMON_SCRIPT=$DAEMON_DIR/server.py
DAEMON_NAME=nlpserver
DAEMON_ARGS=
# DAEMON_USER=root
# The process ID of the script when it runs is stored here:
DAEMON_PID=/tmp/$DAEMON_NAME.pid
. /lib/lsb/init-functions
do_start () {
log_daemon_msg "Starting system '$DAEMON_NAME' daemon..."
/sbin/start-stop-daemon --start --quiet \
--make-pidfile --pidfile $DAEMON_PID --background \
--startas /bin/bash -- -c "python3 $DAEMON_SCRIPT $DAEMON_ARGS > /tmp/$DAEMON_NAME.log 2>&1"
# --exec $DAEMON_SCRIPT \
# --user $DAEMON_USER --chuid $DAEMON_USER
log_end_msg $?
}
do_stop () {
log_daemon_msg "Stopping system '$DAEMON_NAME' daemon..."
/sbin/start-stop-daemon --stop --pidfile $DAEMON_PID --retry 10
log_end_msg $?
}
case "$1" in
start|stop)
do_${1}
;;
restart|reload|force-reload)
do_stop
do_start
;;
status)
ps -e | grep "`cat $DAEMON_PID` "
;;
*)
echo "Usage: $DAEMON_NAME {start|stop|restart|status}"
exit 1
;;
esac
exit 0
from settings import *
from sys import stderr
def print(text):
stderr.write(text + '\n')
print('PREPARING TURBOPARSER')
import turboparser
turbo_interface = turboparser.PTurboParser()
print('LOADING TOKENIZERS')
import nltk
sentence_tokenizer = nltk.data.load(tokenizer_model)
word_tokenizer = nltk.TreebankWordTokenizer()
if 'TAG' in implemented_methods or 'LEMMATIZE' in implemented_methods:
print('LOADING TAGGER')
tagger = turbo_interface.create_tagger()
tagger.load_tagger_model(b_tagger_model)
if 'LEMMATIZE' in implemented_methods or 'TAG' in implemented_methods or 'PARSE' in implemented_methods:
print('LOADING LEMMATIZER')
from lemmatizer import lemmatize
if 'PARSE' in implemented_methods:
print('LOADING PARSER')
parser = turbo_interface.create_parser()
parser.load_parser_model(b_parser_model)
def split_sentences(text):
return sentence_tokenizer.tokenize(text)
def tokenize(sentence):
return word_tokenizer.tokenize(sentence)
def tag_sentence(sentence):
# Write tokens to input file
f_input = open(tmp_input_path, 'w')
for token in tokenize(sentence):
f_input.write(token + '\t_\n')
f_input.close()
# Tag tokens
tagger.tag(b_tmp_input_path, b_tmp_output_path)
# Iterate through tagged tokens
f_output = open(tmp_output_path)
for line in f_output:
line = line.rstrip('\n')
if line == '':
continue
token, tag = line.split('\t')
yield (token, tag)
f_output.close()
def tag_lemmatize_sentence(sentence):
# Write tokens to input file
f_input = open(tmp_input_path, 'w')
for token in tokenize(sentence):
f_input.write(token + '\t_\n')
f_input.close()
# Tag tokens
tagger.tag(b_tmp_input_path, b_tmp_output_path)
# Iterate through tagged tokens
f_output = open(tmp_output_path)
for line in f_output:
line = line.rstrip('\n')
if line == '':
continue
token, tag = line.split('\t')
lemma = lemmatize(token, tag)
yield (token, tag, lemma)
f_output.close()
def parse_sentence(sentence):
# Write tokens to input file
f_input = open(tmp_input_path, 'w')
# Iterate through tagged tokens, prepare input
i = 0
for token, tag, lemma in tag_lemmatize_sentence(sentence):
i += 1
f_input.write(
# position
str(i) + '\t' +
# token
token + '\t' +
# lemma
lemma + '\t' +
# tag (twice)
tag + '\t' +
tag + '\t' +
# filler
'_\t_\t_\n'
)
f_input.close()
# Parse sentence
parser.parse(b_tmp_input_path, b_tmp_output_path)
# Iterate through parsed stuff
f_output = open(tmp_output_path)
for line in f_output:
line = line.rstrip('\n')
if line == '':
continue
fields = line.split('\t')
#
token = fields[1]
lemma = fields[2]
tag = fields[3]
head = str(int(fields[6]) - 1)
deprel = fields[7]
yield (token, tag, lemma, head, deprel)
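# For reference, each parser input line built above is tab-separated:
#   position, token, lemma, tag, tag, '_', '_', '_'
# e.g. an illustrative row (tab-separated): 1  cats  cat  NNS  NNS  _  _  _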
#!python3
import pipeline
import socketserver
from settings import server_type_server, server_host, server_port, server_timeout
from settings import b_implemented_methods
actions = {
b'TAG': pipeline.tag_sentence,
b'LEMMATIZE': pipeline.tag_lemmatize_sentence,
b'PARSE': pipeline.parse_sentence,
}
class NLPServer(socketserver.StreamRequestHandler):
def handle(self):
# What kind of request are we handling?
firstline = self.rfile.readline()
parameters = firstline.split()
if len(parameters) != 2:
self.wfile.write(b'\n\n')
return
action, language = parameters
if action not in b_implemented_methods:
self.wfile.write(b'\n\n')
return
# Get the text data
text = ''
while True:
line = self.rfile.readline().decode()
if not line.strip():
break
text += line
text += '\n'
# Execute the action
method = actions.get(action, None)
if method is None:
for sentence in pipeline.split_sentences(text):
for token in pipeline.tokenize(sentence):
self.wfile.write(
token.encode() + b'\n'
)
self.wfile.write(b'\n')
self.wfile.write(b'\n')
else:
for sentence in pipeline.split_sentences(text):
for row in method(sentence):
self.wfile.write(
(
'\t'.join(row)
).encode() + b'\n'
)
self.wfile.write(b'\n')
self.wfile.write(b'\n')
def handle_timeout(self):
self.request.sendall(b'\n\n')
if __name__ == '__main__':
print('STARTING TCP SERVER')
server = server_type_server((server_host, server_port), NLPServer)
server.timeout = server_timeout
try:
server.serve_forever()
except (KeyboardInterrupt, SystemExit):
print('STOPPING TCP SERVER')
server.shutdown()
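# The wire protocol handled above, summarized (a sketch; assumes the server is
# listening on localhost:7777 as configured in settings.py):
#
#     import socket
#     s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
#     s.connect(('localhost', 7777))
#     s.sendall(b'TAG english\nThe cat sleeps.\n\n')
#     print(s.recv(4096).decode())
#
# The first request line is "<ACTION> <language>", the text follows, and a
# blank line terminates the request; the reply contains one tab-separated row
# per token, a blank line after each sentence, and a final blank line.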
import os
import socket
import socketserver
# Server parameters
server_host = 'localhost'
server_port = 7777
server_type_server = socketserver.TCPServer
server_type_client = socket.AF_INET, socket.SOCK_STREAM
server_timeout = 2.0
server_buffer = 4096
# Implemented methods (others are treated as 'TOKENIZE')
implemented_methods = {'TOKENIZE', 'TAG', 'LEMMATIZE'}
# server_methods = {'TOKENIZE', 'TAG', 'LEMMATIZE', 'PARSE'}
b_implemented_methods = {name.encode() for name in implemented_methods}
# Models
data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
tokenizer_model = os.path.join(data_dir, 'english.pickle')
tagger_model = os.path.join(data_dir, 'english_proj_tagger.model')
# parser_model = 'data/210basic_sd330'
parser_model = os.path.join(data_dir, 'english_proj_parser_pruned-true_model-full.model')
b_tagger_model = tagger_model.encode()
b_parser_model = parser_model.encode()
# Temporary files access
tmp_input_path = '/tmp/nlpserver_input.tmp'
tmp_output_path = '/tmp/nlpserver_output.tmp'
b_tmp_input_path = tmp_input_path.encode()
b_tmp_output_path = tmp_output_path.encode()
/srv/gargantext_lib/taggers/nlpserver/turboparser.cpython-34m.so
\ No newline at end of file
/srv/gargantext_lib/taggers/treetagger
\ No newline at end of file
@@ -14,6 +14,7 @@ html5lib==0.9999999
jdatetime==1.7.2
kombu==3.0.33
lxml==3.5.0
nltk==3.1
psycopg2==2.6.1
pycountry==1.20
python-dateutil==2.4.2