diff --git a/gargantext_web/settings.py b/gargantext_web/settings.py index 2ad4d0f4d50fb1bd0aa5860896354363235cf479..c590e3771bd051d6f0b9f95990532c8b9840894d 100644 --- a/gargantext_web/settings.py +++ b/gargantext_web/settings.py @@ -50,7 +50,6 @@ SECRET_KEY = 'bt)3n9v&a02cu7^^=+u_t2tmn8ex5fvx8$x4r*j*pb1yawd+rz' # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True - MAINTENANCE = False TEMPLATE_DEBUG = False @@ -71,17 +70,17 @@ TEMPLATE_DIRS = ( #ALLOWED_HOSTS = ['*',] -ALLOWED_HOSTS = ['localhost', - 'gargantext.org', - 'stable.gargantext.org', - 'dev.gargantext.org', - 'iscpif.gargantext.org', - 'gargantext.iscpif.fr', - 'mines.gargantext.org', - 'pasteur.gargantext.org', - 'beta.gargantext.org', - 'garg-dev.iscpif.fr', - 'garg-stable.iscpif.fr', +ALLOWED_HOSTS = ['localhost', + 'gargantext.org', + 'stable.gargantext.org', + 'dev.gargantext.org', + 'iscpif.gargantext.org', + 'gargantext.iscpif.fr', + 'mines.gargantext.org', + 'pasteur.gargantext.org', + 'beta.gargantext.org', + 'garg-dev.iscpif.fr', + 'garg-stable.iscpif.fr', ] diff --git a/parsing/FileParsers/FileParser.py b/parsing/FileParsers/FileParser.py index effa5167ad334c5a91747a15c9487d0e1b0f51e5..f65d42561209dc819e63046ed33cc97d60830bcd 100644 --- a/parsing/FileParsers/FileParser.py +++ b/parsing/FileParsers/FileParser.py @@ -34,17 +34,19 @@ class FileParser: """ # First, check the split dates... 
+ # This part mainly deals with Zotero data but can be useful for other + # parts date_string = hyperdata.get('publication_date_to_parse', None) if date_string is not None: - date_string = re.sub(r'\/\/+', '', date_string) - date_string = re.sub(r'undefined', '', date_string) + date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string) + #date_string = re.sub(r'undefined', '', date_string) try: hyperdata['publication' + "_date"] = dateutil.parser.parse( date_string, default=DEFAULT_DATE ).strftime("%Y-%m-%d %H:%M:%S") - except: - print('Parser Zotero, Date not parsed for:', date_string) + except Exception as error: + print(error, 'Parser Zotero, Date not parsed for:', date_string) hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") diff --git a/parsing/Taggers/MeltTagger.py b/parsing/Taggers/MeltTagger.py new file mode 100644 index 0000000000000000000000000000000000000000..c92eee7629e089d5a8bfae8704d96c2a8c179785 --- /dev/null +++ b/parsing/Taggers/MeltTagger.py @@ -0,0 +1,113 @@ +from .Tagger import Tagger +from .lib.melttagger.tagger import POSTagger, Token, DAGParser, DAGReader + +import subprocess +import sys +import os + + +# references for tag equivalents: +# - http://cs.nyu.edu/grishman/jet/guide/PennPOS.html +# - http://www.lattice.cnrs.fr/sites/itellier/SEM.html +class identity_dict(dict): + def __missing__(self, key): + return key +_tag_replacements = identity_dict({ + 'DET': 'DT', + 'NC': 'NN', + 'NPP': 'NNP', + 'ADJ': 'JJ', + 'PONCT': '.', + 'ADVWH': 'WRB', + 'ADV': 'RB', + 'DETWH': 'WDT', + 'PROWH': 'WP', + 'ET': 'FW', + 'VINF': 'VB', + 'I': 'UH', + 'CS': 'IN', + + # 'CLS': '', + # 'CLR': '', + # 'CLO': '', + + # 'PRO': '', + # 'PROREL': '', + # 'P': '', + # 'P+D': '', + # 'P+PRO': '', + + # 'V': '', + # 'VPR': '', + # 'VPP': '', + # 'VS': '', + # 'VIMP': '', + + # 'PREF': '', + # 'ADJWH': '', +}) + + +class MeltTagger(Tagger): + + def start(self, language='fr', melt_data_path='lib/melttagger'): + basepath = 
os.path.dirname(os.path.realpath(__file__)) + path = os.path.join(basepath, melt_data_path) + self._pos_tagger = POSTagger() + self._pos_tagger.load_tag_dictionary('%s/%s/tag_dict.json' % (path, language)) + self._pos_tagger.load_lexicon('%s/%s/lexicon.json' % (path, language)) + self._pos_tagger.load_model('%s/%s' % (path, language)) + self._preprocessing_commands = ( + # ('/usr/local/bin/clean_noisy_characters.sh', ), + ('%s/MElt_normalizer.pl' % path, '-nc', '-c', '-d', '%s/%s' % (path, language), '-l', language, ), + ('%s/segmenteur.pl' % path, '-a', '-ca', '-af=%s/pctabr' % path, '-p', 'r'), + ) + self._lemmatization_commands = ( + ('%s/MElt_postprocess.pl' % path, '-npp', '-l', language), + ('%s/MElt_lemmatizer.pl' % path, '-m', '%s/%s' % (path, language)), + ) + + def stop(self): + pass + + def _pipe(self, text, commands, encoding='utf8'): + text = text.encode(encoding) + for command in commands: + process = subprocess.Popen( + command, + bufsize=0, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + text, err = process.communicate(text) + if len(err): + print(err.decode(encoding), file=sys.stderr) + return text.decode(encoding) + + def _tag(self, text): + preprocessed = self._pipe(text, self._preprocessing_commands) + for sentence in preprocessed.split('\n'): + words = sentence.split(' ') + tokens = [Token(word) for word in words] + tagged_tokens = self._pos_tagger.tag_token_sequence(tokens) + for token in tagged_tokens: + if len(token.string): + yield (token.string, _tag_replacements[token.label], ) + + def tag_text(self, text, lemmatize=True): + tagged_tokens = self._tag(text) + if not lemmatize: + for tagged_token in tagged_tokens: + yield tagged_token + return + # lemmatization + command_input = ' '.join( + '%s/%s' % (token, tag) + for token, tag in tagged_tokens + ) + lemmatized = self._pipe(command_input, self._lemmatization_commands) + for token in lemmatized.split(): + if len(token): + values = token.split('/') + yield 
(values[0], values[1], values[2].replace('*', '')) diff --git a/parsing/Taggers/README.txt b/parsing/Taggers/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..740dfcd088da7e269b842b9bad2631b2515a5a53 --- /dev/null +++ b/parsing/Taggers/README.txt @@ -0,0 +1,8 @@ +In this repo are all files for Gargantext Taggers. + +For developers please indicate this path: +/srv/gargantext_lib/gargantext-taggers/. + +Then this repo should be located in /srv/gargantext_lib + + diff --git a/parsing/Taggers/TreeTagger.py b/parsing/Taggers/TreeTagger.py index fd0133e5b41079c8437ceb82d09f77ee1dd9a6f0..f4492cb09d6c207b49db90cb432af619f9b2b1a0 100644 --- a/parsing/Taggers/TreeTagger.py +++ b/parsing/Taggers/TreeTagger.py @@ -10,7 +10,7 @@ import time class identity_dict(dict): def __missing__(self, key): return key - + _tag_replacements = identity_dict({ "NOM": "NN", "NAM": "NN", @@ -45,8 +45,8 @@ def _readOutput(output, buffer): Shall be used for french texts. """ class TreeTagger(Tagger): - - def start(self, treeTaggerPath = "./parsing/Taggers/treetagger"): + + def start(self, treeTaggerPath = "./parsing/Taggers/lib/treetagger"): binaryFile = "%s/bin/tree-tagger" % treeTaggerPath tagcmdlist = [ binaryFile, @@ -67,7 +67,7 @@ class TreeTagger(Tagger): self._input, self._output = self._popen.stdin, self._popen.stdout # self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start() # self.buffer = OutputBuffer() - + def stop(self): # terminates the 'treetagger' process try: @@ -75,20 +75,20 @@ class TreeTagger(Tagger): self._popen.terminate() except: pass - + def tagging_start(self): self.buffer = [] self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )) self._thread.start() self._input.write(b"<block>\n") - + def tagging_end(self): self._input.write(b"<block/>\n") # sends some dummy tokens, then wait for the text to be treated self.tag_tokens("Les sanglots longs des violons de l ' automne 
bercent mon coeur d ' une langueur monotone .".split(), False) self._thread.join() - - + + def tag_tokens(self, tokens, single=True): if single: self.tagging_start() @@ -97,7 +97,7 @@ class TreeTagger(Tagger): if single: self.tagging_end() return self.buffer - + def tag_text(self, text): self.tagging_start() for line in text.split('\n'): diff --git a/parsing/Taggers/TurboTagger.py b/parsing/Taggers/TurboTagger.py index 5a730e9432cee72b5e87455c6244dbb9cf05dee5..dc362db1498f2f58ff0176e245441c2ff7cb456b 100644 --- a/parsing/Taggers/TurboTagger.py +++ b/parsing/Taggers/TurboTagger.py @@ -1,9 +1,9 @@ from .Tagger import Tagger -from .nlpserver.client import NLPClient +from .lib.nlpserver.client import NLPClient class TurboTagger: - + def start(self): self._nlpclient = NLPClient() diff --git a/parsing/Taggers/__init__.py b/parsing/Taggers/__init__.py index 3f8167d4700b371da88dada8dc66d3b39601582c..3e979b8b54806f0cf723f45ff253d4103c8cc6a6 100644 --- a/parsing/Taggers/__init__.py +++ b/parsing/Taggers/__init__.py @@ -2,3 +2,4 @@ from .Tagger import Tagger from .NltkTagger import NltkTagger from .TreeTagger import TreeTagger from .TurboTagger import TurboTagger +from .MeltTagger import MeltTagger diff --git a/parsing/Taggers/lib/melttagger b/parsing/Taggers/lib/melttagger new file mode 120000 index 0000000000000000000000000000000000000000..57680a28f55eebb425c25a14d479dfe7db41a0b5 --- /dev/null +++ b/parsing/Taggers/lib/melttagger @@ -0,0 +1 @@ +/srv/gargantext_lib/taggers/melttagger \ No newline at end of file diff --git a/parsing/Taggers/nlpserver/README.md b/parsing/Taggers/lib/nlpserver/README.md similarity index 100% rename from parsing/Taggers/nlpserver/README.md rename to parsing/Taggers/lib/nlpserver/README.md diff --git a/parsing/Taggers/nlpserver/client.py b/parsing/Taggers/lib/nlpserver/client.py similarity index 100% rename from parsing/Taggers/nlpserver/client.py rename to parsing/Taggers/lib/nlpserver/client.py diff --git a/parsing/Taggers/lib/nlpserver/data 
b/parsing/Taggers/lib/nlpserver/data new file mode 120000 index 0000000000000000000000000000000000000000..7de560ffd55406b6ea7e7d35a090267c20b62391 --- /dev/null +++ b/parsing/Taggers/lib/nlpserver/data @@ -0,0 +1 @@ +/srv/gargantext_lib/taggers/nlpserver/data \ No newline at end of file diff --git a/parsing/Taggers/nlpserver/lemmatizer.py b/parsing/Taggers/lib/nlpserver/lemmatizer.py similarity index 100% rename from parsing/Taggers/nlpserver/lemmatizer.py rename to parsing/Taggers/lib/nlpserver/lemmatizer.py diff --git a/parsing/Taggers/nlpserver/nlpserver b/parsing/Taggers/lib/nlpserver/nlpserver similarity index 100% rename from parsing/Taggers/nlpserver/nlpserver rename to parsing/Taggers/lib/nlpserver/nlpserver diff --git a/parsing/Taggers/nlpserver/pipeline.py b/parsing/Taggers/lib/nlpserver/pipeline.py similarity index 100% rename from parsing/Taggers/nlpserver/pipeline.py rename to parsing/Taggers/lib/nlpserver/pipeline.py diff --git a/parsing/Taggers/nlpserver/server.py b/parsing/Taggers/lib/nlpserver/server.py similarity index 100% rename from parsing/Taggers/nlpserver/server.py rename to parsing/Taggers/lib/nlpserver/server.py diff --git a/parsing/Taggers/nlpserver/settings.py b/parsing/Taggers/lib/nlpserver/settings.py similarity index 100% rename from parsing/Taggers/nlpserver/settings.py rename to parsing/Taggers/lib/nlpserver/settings.py diff --git a/parsing/Taggers/lib/nlpserver/test.py b/parsing/Taggers/lib/nlpserver/test.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/parsing/Taggers/lib/nlpserver/turboparser.cpython-34m.so b/parsing/Taggers/lib/nlpserver/turboparser.cpython-34m.so new file mode 120000 index 0000000000000000000000000000000000000000..6a685808cf25d910e4f4df750d063409de98d954 --- /dev/null +++ b/parsing/Taggers/lib/nlpserver/turboparser.cpython-34m.so @@ -0,0 +1 @@ +/srv/gargantext_lib/taggers/nlpserver/turboparser.cpython-34m.so \ No newline at end of file 
diff --git a/parsing/Taggers/lib/treetagger b/parsing/Taggers/lib/treetagger new file mode 120000 index 0000000000000000000000000000000000000000..eaf73fb3b4d32090e18c0fd4295fb264cffb5c6b --- /dev/null +++ b/parsing/Taggers/lib/treetagger @@ -0,0 +1 @@ +/srv/gargantext_lib/taggers/treetagger \ No newline at end of file diff --git a/parsing/Taggers/nlpserver/data/.gitignore b/parsing/Taggers/nlpserver/data/.gitignore deleted file mode 100644 index cd4a96a74b1a0d0d79f88e9c75324a2988df6530..0000000000000000000000000000000000000000 --- a/parsing/Taggers/nlpserver/data/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.model \ No newline at end of file diff --git a/parsing/Taggers/nlpserver/data/english.pickle b/parsing/Taggers/nlpserver/data/english.pickle deleted file mode 100644 index ca5a7ccd0386046def7c1591816321249436b629..0000000000000000000000000000000000000000 Binary files a/parsing/Taggers/nlpserver/data/english.pickle and /dev/null differ diff --git a/parsing/Taggers/nlpserver/turboparser.cpython-34m.so b/parsing/Taggers/nlpserver/turboparser.cpython-34m.so deleted file mode 100755 index 231db1ac8f43a5c69da74ff71f3161385258b3a5..0000000000000000000000000000000000000000 Binary files a/parsing/Taggers/nlpserver/turboparser.cpython-34m.so and /dev/null differ diff --git a/parsing/Taggers/treetagger b/parsing/Taggers/treetagger deleted file mode 120000 index dbcd60dc9c4c4dc9f3e2a0f744aab72fe82ee74b..0000000000000000000000000000000000000000 --- a/parsing/Taggers/treetagger +++ /dev/null @@ -1 +0,0 @@ -/srv/gargantext_lib/treetagger \ No newline at end of file diff --git a/templates/corpus.html b/templates/corpus.html index 05d82da35213d14e1c923a333b8b2ef9b1df8815..b2fe8c8a4fd779c7947b93894a66bbd1548ded33 100644 --- a/templates/corpus.html +++ b/templates/corpus.html @@ -149,7 +149,8 @@ th a { <div class="col-md-4"> <div class="jumbotron"> - <h3><a href="/project/{{project.id}}/corpus/{{corpus.id}}/matrix">Matrix</a></h3> + <!-- <h3><a 
href="/project/{{project.id}}/corpus/{{corpus.id}}/matrix">Matrix</a></h3> --> + <h3>Matrix (soon)</h3> <ol> <li>Sort</li> <li>Group</li> diff --git a/test-melt.py b/test-melt.py new file mode 100644 index 0000000000000000000000000000000000000000..717309ec25ca33c9b9c7a9114e6a03b845793995 --- /dev/null +++ b/test-melt.py @@ -0,0 +1,140 @@ +from parsing.Taggers import MeltTagger + + +# from parsing.Taggers.melttagger.tagger import POSTagger, Token, DAGParser, DAGReader + + +# # references: +# # - http://cs.nyu.edu/grishman/jet/guide/PennPOS.html +# # - http://www.lattice.cnrs.fr/sites/itellier/SEM.html +# class identity_dict(dict): +# def __missing__(self, key): +# return key +# _tag_replacements = identity_dict({ +# 'DET': 'DT', +# 'NC': 'NN', +# 'NPP': 'NNP', +# 'ADJ': 'JJ', +# 'PONCT': '.', +# 'ADVWH': 'WRB', +# 'ADV': 'RB', +# 'DETWH': 'WDT', +# 'PROWH': 'WP', +# 'ET': 'FW', +# 'VINF': 'VB', +# 'I': 'UH', +# 'CS': 'IN', + +# # 'CLS': '', +# # 'CLR': '', +# # 'CLO': '', + +# # 'PRO': '', +# # 'PROREL': '', +# # 'P': '', +# # 'P+D': '', +# # 'P+PRO': '', + +# # 'V': '', +# # 'VPR': '', +# # 'VPP': '', +# # 'VS': '', +# # 'VIMP': '', + +# # 'PREF': '', +# # 'ADJWH': '', +# }) + + +# import subprocess + + +# class MeltTagger: + +# def __init__(self, language='fr', melt_data_path='./parsing/Taggers/melttagger'): +# path = '%s/%s' % (melt_data_path, language) +# self.pos_tagger = POSTagger() +# self.pos_tagger.load_tag_dictionary('%s/tag_dict.json' % path) +# self.pos_tagger.load_lexicon('%s/lexicon.json' % path) +# self.pos_tagger.load_model('%s' % path) +# self._preprocessing_commands = ( +# # ('/usr/local/bin/clean_noisy_characters.sh', ), +# # ('/usr/local/bin/MElt_normalizer.pl', '-nc', '-c', '-d', '/usr/local/share/melt/normalization/%s' % language, '-l', language, ), +# ('/usr/local/share/melt/segmenteur.pl', '-a', '-ca', '-af=/usr/local/share/melt/pctabr', '-p', 'r'), +# ) +# self._lemmatization_commands = ( +# ('/usr/local/bin/MElt_postprocess.pl', '-npp', 
'-l', language), +# ('MElt_lemmatizer.pl', '-m', '/usr/local/share/melt/%s' % language), +# ) + +# def pipe(self, text, commands, encoding='utf8'): +# text = text.encode(encoding) +# # print(text.decode(encoding)) +# for command in commands: +# # print(command) +# process = subprocess.Popen( +# command, +# bufsize=0, +# stdin=subprocess.PIPE, +# stdout=subprocess.PIPE, +# stderr=subprocess.PIPE, +# ) +# text, err = process.communicate(text) +# # print() +# # print(text.decode(encoding)) +# if len(err): +# print(err.decode(encoding)) +# return text.decode(encoding) + +# def tag(self, text, encoding='utf8', lemmatize=True): +# preprocessed = self.pipe(text, self._preprocessing_commands) +# if lemmatize: +# result = '' +# for sentence in preprocessed.split('\n'): +# words = sentence.split(' ') +# tokens = [Token(word) for word in words] +# tagged_tokens = self.pos_tagger.tag_token_sequence(tokens) +# # result += ' '.join(token.__str__() for token in tagged_tokens) +# for token in tagged_tokens: +# if len(token.string): +# result += '%s/%s ' % (token.string, token.label, ) +# result += '\n' +# lemmatized = self.pipe(result, self._lemmatization_commands) +# for sentence in lemmatized.split('\n'): +# for token in sentence.split(' '): +# if len(token): +# yield tuple(token.split('/')) +# else: +# for sentence in preprocessed.split('\n'): +# words = sentence.split(' ') +# tokens = [Token(word) for word in words] +# tagged_tokens = self.pos_tagger.tag_token_sequence(tokens) +# for token in tagged_tokens: +# if len(token.string): +# yield (token.string, _tag_replacements[token.label], ) + + +if __name__ == '__main__': + from time import time + t0 = time() + tagger = MeltTagger() + print(time() - t0) + print() + text = """Le vieil hôtel de ville, construit de 1608 à 1610 est le plus ancien bâtiment de la ville de Wiesbaden. 
Il se dresse sur la place centrale de la vieille ville, la Place du Palais, qui abrite aujourd'hui le Parlement de l'État de Hesse, l'église et l'hôtel de ville. + Il a été construit dans le style Renaissance. On a ajouté, en 1828, un étage de style romantique historié. Sur les bas-reliefs des cinq fenêtres de l'étage, en bois, étaient représentées les vertus de la force, la justice, la charité, de prudence et de modération, alors que la pierre a remplacé par des copies. Le pièces de chêne d'origine peut être visitées aujourd'hui au Musée de Wiesbaden. Aujourd'hui, le bâtiment sert de bureau de la ville de Wiesbaden. + Devant le porche, entre l'hôtel de Ville et l'Ancien hôtel de ville, se trouve la colonne centrale de Nassau, un lion couronné avec bouclier. + Il s'agit de construire progressivement, à partir des données initiales, un sous-graphe dans lequel sont classés les différents sommets par ordre croissant de leur distance minimale au sommet de départ. La distance correspond à la somme des poids des arêtes empruntées. + Au départ, on considère que les distances de chaque sommet au sommet de départ sont infinies. Au cours de chaque itération, on va mettre à jour les distances des sommets reliés par un arc au dernier du sous-graphe (en ajoutant le poids de l'arc à la distance séparant ce dernier sommet du sommet de départ ; si la distance obtenue ainsi est supérieure à celle qui précédait, la distance n'est cependant pas modifiée). Après cette mise à jour, on examine l'ensemble des sommets qui ne font pas partie du sous-graphe, et on choisit celui dont la distance est minimale pour l'ajouter au sous-graphe. + La première étape consiste à mettre de côté le sommet de départ et à lui attribuer une distance de 0. Les sommets qui lui sont adjacents sont mis à jour avec une valeur égale au poids de l'arc qui les relie au sommet de départ (ou à celui de poids le plus faible si plusieurs arcs les relient) et les autres sommets conservent leur distance infinie. 
+ Le plus proche des sommets adjacents est alors ajouté au sous-graphe. + La seconde étape consiste à mettre à jour les distances des sommets adjacents à ce dernier. Encore une fois, on recherche alors le sommet doté de la distance la plus faible. Comme tous les sommets n'avaient plus une valeur infinie, il est donc possible que le sommet choisi ne soit pas un des derniers mis à jour. + On l'ajoute au sous-graphe, puis on continue ainsi à partir du dernier sommet ajouté, jusqu'à épuisement des sommets ou jusqu'à sélection du sommet d'arrivée. + """ + i = 0 + t0 = time() + for x in tagger.tag_text(text, lemmatize=True): + print(x) + i += 1 + t = time() - t0 + print(t) + print(t / i)