Commit 6a7b8b89 authored by Administrator

[FIX] real path for lib/parsers

parent 44871a3f
/srv/gargantext-taggers
\ No newline at end of file
from .Tagger import Tagger
import subprocess
import threading
import time
# TODO: have a look at "queue" instead of "list" (cf. http://stackoverflow.com/questions/17564804/in-python-how-to-wait-until-only-the-first-thread-is-finished)
class identity_dict(dict):
def __missing__(self, key):
return key
# Penn Treebank equivalents for the tagger's French tagset.
_tag_replacements = identity_dict({
    "NOM":  "NN",
    "NAM":  "NNP",
    "ADJ":  "JJ",
    "VER":  "VB",
    "PREP": "IN",
    "KON":  "CC",
    "DET":  "DT",
    "PRO":  "PRP",
    # Do we also have to take semicolons, commas and other punctuation marks into account?
})
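
# The identity_dict above returns the key itself for missing entries, so any
# tag without an explicit replacement passes through unchanged. A quick
# illustration (not part of the tagger):
#
#     _tag_replacements['NOM']      # -> 'NN'       (mapped)
#     _tag_replacements['UNKNOWN']  # -> 'UNKNOWN'  (falls through)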
def _readOutput(output, buffer):
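    """Read the tagger's output pipe, keeping only the lines between the
    b"<block>" and b"<block/>" markers, and append (token, tag) couples
    to the given buffer."""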
hasStarted = False
while True:
line = output.readline()
if line:
if line == b"<block>\n":
hasStarted = True
continue
if line == b"<block/>\n":
break
if hasStarted:
token, tag = line.decode('utf8').split()[:2]
tag = _tag_replacements[tag.split(':')[0]]
buffer.append((token, tag))
else:
time.sleep(0.1)
"""Use MElt for the tagging.
"""
class Melt(Tagger):
def start(self, taggerPath = "/usr/local/bin/"):
binaryFile = "%s/MElt" % taggerPath
tagcmdlist = [
binaryFile,
"-l",
]
self._popen = subprocess.Popen(
tagcmdlist, # Use a list of params in place of a string.
bufsize=0, # Not buffered to retrieve data asap from Tagger
executable=binaryFile, # As we have it, specify it
stdin=subprocess.PIPE, # Get a pipe to write input data to Tagger process
stdout=subprocess.PIPE, # Get a pipe to read processing results from Tagger
stderr=subprocess.PIPE, # Get a pipe to read processing results from Tagger
)
self._input, self._output = self._popen.stdin, self._popen.stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
    def stop(self):
        # stop the MElt process (SIGTERM first, then SIGKILL)
        try:
            self._popen.terminate()
            self._popen.kill()
        except Exception:
            pass
def tagging_start(self):
self.buffer = []
self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
self._thread.start()
#self._input.write(b"<block>\n")
def tagging_end(self):
#self._input.write(b"<block/>\n")
        # send some dummy tokens, then wait for the text to be processed
#self.tag_tokens("Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split(), False)
self._thread.join()
def tag_tokens(self, tokens, single=True):
if single:
self.tagging_start()
for token in tokens:
self._input.write(bytes(token + "\n", "utf8"))
if single:
self.tagging_end()
return self.buffer
def tag_text(self, text):
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
self.tag_tokens(tokens, False)
self.tagging_end()
return self.buffer
from .Tagger import Tagger
from .lib.melttagger.tagger import POSTagger, Token, DAGParser, DAGReader
import subprocess
import sys
import os
# references for tag equivalents:
# - http://cs.nyu.edu/grishman/jet/guide/PennPOS.html
# - http://www.lattice.cnrs.fr/sites/itellier/SEM.html
class identity_dict(dict):
def __missing__(self, key):
return key
_tag_replacements = identity_dict({
'DET': 'DT',
'NC': 'NN',
'NPP': 'NNP',
'ADJ': 'JJ',
'PONCT': '.',
'ADVWH': 'WRB',
'ADV': 'RB',
'DETWH': 'WDT',
'PROWH': 'WP',
'ET': 'FW',
'VINF': 'VB',
'I': 'UH',
'CS': 'IN',
# 'CLS': '',
# 'CLR': '',
# 'CLO': '',
# 'PRO': '',
# 'PROREL': '',
# 'P': '',
# 'P+D': '',
# 'P+PRO': '',
# 'V': '',
# 'VPR': '',
# 'VPP': '',
# 'VS': '',
# 'VIMP': '',
# 'PREF': '',
# 'ADJWH': '',
})
class MeltTagger(Tagger):
def start(self, language='fr', melt_data_path='lib/melttagger'):
basepath = os.path.dirname(os.path.realpath(__file__))
path = os.path.join(basepath, melt_data_path)
self._pos_tagger = POSTagger()
self._pos_tagger.load_tag_dictionary('%s/%s/tag_dict.json' % (path, language))
self._pos_tagger.load_lexicon('%s/%s/lexicon.json' % (path, language))
self._pos_tagger.load_model('%s/%s' % (path, language))
self._preprocessing_commands = (
# ('/usr/local/bin/clean_noisy_characters.sh', ),
('%s/MElt_normalizer.pl' % path, '-nc', '-c', '-d', '%s/%s' % (path, language), '-l', language, ),
('%s/segmenteur.pl' % path, '-a', '-ca', '-af=%s/pctabr' % path, '-p', 'r'),
)
self._lemmatization_commands = (
('%s/MElt_postprocess.pl' % path, '-npp', '-l', language),
('%s/MElt_lemmatizer.pl' % path, '-m', '%s/%s' % (path, language)),
)
def stop(self):
pass
def _pipe(self, text, commands, encoding='utf8'):
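        """Pipe the text through the given sequence of external commands and
        return the decoded output; anything the commands write to stderr is
        forwarded to sys.stderr."""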
text = text.encode(encoding)
for command in commands:
process = subprocess.Popen(
command,
bufsize=0,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
text, err = process.communicate(text)
if len(err):
print(err.decode(encoding), file=sys.stderr)
return text.decode(encoding)
def _tag(self, text):
preprocessed = self._pipe(text, self._preprocessing_commands)
for sentence in preprocessed.split('\n'):
words = sentence.split(' ')
tokens = [Token(word) for word in words]
tagged_tokens = self._pos_tagger.tag_token_sequence(tokens)
for token in tagged_tokens:
if len(token.string):
yield (token.string, _tag_replacements[token.label], )
def tag_text(self, text, lemmatize=True):
tagged_tokens = self._tag(text)
if not lemmatize:
for tagged_token in tagged_tokens:
yield tagged_token
return
# lemmatization
command_input = ' '.join(
'%s/%s' % (token, tag)
for token, tag in tagged_tokens
)
lemmatized = self._pipe(command_input, self._lemmatization_commands)
for token in lemmatized.split():
if len(token):
values = token.split('/')
yield (values[0], values[1], values[2].replace('*', ''))
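
# Example usage (an untested sketch; assumes the MElt models and scripts are
# installed under lib/melttagger, as loaded by start()):
#
#     tagger = MeltTagger()
#     for token, tag, lemma in tagger.tag_text('Le chat dort.'):
#         print(token, tag, lemma)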
from .Tagger import Tagger
import nltk
class NltkTagger(Tagger):
def tag_tokens(self, tokens, single=True):
return nltk.pos_tag(tokens)
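
# Example usage (a sketch; nltk.pos_tag needs the NLTK tagger model to be
# downloaded first, e.g. nltk.download('averaged_perceptron_tagger') on
# recent NLTK versions):
#
#     NltkTagger().tag_tokens(['This', 'is', 'not', 'a', 'sentence', '.'])
#     # -> [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]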
This repo contains all the files for the Gargantext taggers.
Developers should reference this path:
/srv/gargantext_lib/gargantext-taggers/.
This repo should therefore be located in /srv/gargantext_lib.
import re
"""This base class is a model for performing tagging in a pipeline fashion.
When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned
in a tuple format.
"""
class Tagger:
def __init__(self):
# This regular expression is really good at tokenizing a text!
self._re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
self.start()
def __del__(self):
self.stop()
"""Initializes the tagger. This method is called by the constructor.
This method can be overriden by inherited classes.
"""
def start(self):
pass
"""Ends the tagger.
This method is called by the destructor.
This method can be overriden by inherited classes.
"""
def stop(self):
pass
"""This method is userful in the case of pipelines requiring
boundaries around blocks of text.
"""
def tagging_start(self):
pass
def tagging_end(self):
pass
"""Returns the tagged tokens.
This method shall be overriden by inherited classes.
Example of input: ['This', 'is', 'not', 'a', 'sentence', '.']
Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
"""
def tag_tokens(self, tokens, single=True):
if single:
self.tagging_start()
# do something with the tokens here
if single:
self.tagging_end()
return []
"""Send a text to be tagged.
"""
# Not used right now
def tag_text(self, text):
tokens_tags = []
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
tokens_tags += self.tag_tokens(tokens, False)
self.tagging_end()
return tokens_tags
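
# A minimal subclass sketch (illustrative only, not part of the library):
# a real tagger overrides tag_tokens() and returns (token, tag) couples.
#
#     class DummyTagger(Tagger):
#         def tag_tokens(self, tokens, single=True):
#             # tag everything as a noun, just to show the expected output shape
#             return [(token, 'NN') for token in tokens]
#
#     DummyTagger().tag_tokens(['This', 'is', 'not', 'a', 'sentence', '.'])
#     # -> [('This', 'NN'), ('is', 'NN'), ('not', 'NN'), ...]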
from .Tagger import Tagger
import subprocess
import threading
import time
# TODO: have a look at "queue" instead of "list" (cf. http://stackoverflow.com/questions/17564804/in-python-how-to-wait-until-only-the-first-thread-is-finished)
class identity_dict(dict):
def __missing__(self, key):
return key
# Penn Treebank equivalents for the tagger's French tagset.
_tag_replacements = identity_dict({
    "NOM":  "NN",
    "NAM":  "NNP",
    "ADJ":  "JJ",
    "VER":  "VB",
    "PREP": "IN",
    "KON":  "CC",
    "DET":  "DT",
    "PRO":  "PRP",
    # Do we also have to take semicolons, commas and other punctuation marks into account?
})
def _readOutput(output, buffer):
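    """Read the tagger's output pipe, keeping only the lines between the
    b"<block>" and b"<block/>" markers, and append (token, tag) couples
    to the given buffer."""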
hasStarted = False
while True:
line = output.readline()
if line:
if line == b"<block>\n":
hasStarted = True
continue
if line == b"<block/>\n":
break
if hasStarted:
token, tag = line.decode('utf8').split()[:2]
tag = _tag_replacements[tag.split(':')[0]]
buffer.append((token, tag))
else:
time.sleep(0.1)
"""Use TreeTagger for the tagging.
Shall be used for french texts.
"""
class TreeTagger(Tagger):
def start(self, treeTaggerPath = "./parsing/Taggers/treetagger"):
binaryFile = "%s/bin/tree-tagger" % treeTaggerPath
tagcmdlist = [
binaryFile,
"%s/lib/french-utf8.par" % treeTaggerPath,
"-token",
"-lemma",
"-sgml",
"-quiet"
]
self._popen = subprocess.Popen(
tagcmdlist, # Use a list of params in place of a string.
bufsize=0, # Not buffered to retrieve data asap from TreeTagger
executable=binaryFile, # As we have it, specify it
stdin=subprocess.PIPE, # Get a pipe to write input data to TreeTagger process
stdout=subprocess.PIPE, # Get a pipe to read processing results from TreeTagger
stderr=subprocess.PIPE, # Get a pipe to read processing results from TreeTagger
)
self._input, self._output = self._popen.stdin, self._popen.stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
    def stop(self):
        # stop the 'tree-tagger' process (SIGTERM first, then SIGKILL)
        try:
            self._popen.terminate()
            self._popen.kill()
        except Exception:
            pass
def tagging_start(self):
self.buffer = []
self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
self._thread.start()
self._input.write(b"<block>\n")
def tagging_end(self):
self._input.write(b"<block/>\n")
        # send some dummy tokens, then wait for the text to be processed
self.tag_tokens("Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split(), False)
self._thread.join()
def tag_tokens(self, tokens, single=True):
if single:
self.tagging_start()
for token in tokens:
self._input.write(bytes(token + "\n", "utf8"))
if single:
self.tagging_end()
return self.buffer
def tag_text(self, text):
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
self.tag_tokens(tokens, False)
self.tagging_end()
return self.buffer
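
# Example usage (an untested sketch; assumes a TreeTagger installation with
# the French parameter file under ./parsing/Taggers/treetagger):
#
#     tagger = TreeTagger()
#     print(tagger.tag_text("Les violons de l'automne bercent mon coeur."))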
from .Tagger import Tagger
from .lib.nlpserver.client import NLPClient
class TurboTagger(Tagger):
def start(self):
self._nlpclient = NLPClient()
def stop(self):
if hasattr(self, '_nlpclient'):
del self._nlpclient
def tag_text(self, text):
if not hasattr(self, '_nlpclient'):
self._nlpclient = NLPClient()
tokens_tags = []
for sentence in self._nlpclient.tag(text):
for token, tag in sentence:
tokens_tags.append((token, tag, ))
return tokens_tags
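
# Example usage (a sketch; assumes the NLP server from .lib.nlpserver is
# running and reachable by NLPClient):
#
#     tagger = TurboTagger()
#     print(tagger.tag_text('This is a sentence.'))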
from .Tagger import Tagger
from .NltkTagger import NltkTagger
from .TreeTagger import TreeTagger
from .TurboTagger import TurboTagger
from .MeltTagger import MeltTagger
/srv/gargantext_lib/taggers
\ No newline at end of file