Commit 298679e1 authored by delanoe

Merge remote-tracking branch 'origin/romain-testing-taggers-parsers' into testing-merge

parents 427c107d 5daefcf6
"""
Creates MD5 hashes (used for unique filepaths)
NB this module could go inside util.files
"""
import hashlib import hashlib
import binascii import binascii
def digest(value, algorithm='md5'): def digest(value, algorithm='md5'):
"""
Ex: b'm\x00\x07\xe5/z\xfb}Z\x06P\xb0\xff\xb8\xa4\xd1'
(16 bytes ranging from 0 to ff)
"""
m = hashlib.new(algorithm) m = hashlib.new(algorithm)
m.update(value) m.update(value)
return m.digest() return m.digest()
def str_digest(value, algorithm='md5'): def str_digest(value, algorithm='md5'):
"""
Ex: 6d0007e52f7afb7d5a0650b0ffb8a4d1
(32 hex chars)
"""
return binascii.hexlify(digest(value, algorithm)).decode() return binascii.hexlify(digest(value, algorithm)).decode()
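A quick usage sketch (hypothetical, not part of the commit; the import path is assumed since the docstring leaves the module's final location open). Note that `digest` feeds hashlib directly, so it expects bytes:

```python
from hash import digest, str_digest   # import path assumed

print(str_digest(b"hello"))                 # 5d41402abc4b2a76b9719d911017c592
print(digest(b"hello"))                     # b']A@*\xbcK*v\xb9q\x9d\x91\x10\x17\xc5\x92'
print(str_digest("hello".encode('utf-8')))  # str values must be encoded first
```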
@@ -4,8 +4,8 @@
 """
 __author__ = "Gargantext Team"
-__copyright__ = "Copyright 2014-15 ISCPIF-CNRS"
-__version__ = "0.1"
+__copyright__ = "Copyright 2014-16 ISCPIF-CNRS"
+__version__ = "0.2"
 __email__ = "romain.loth@iscpif.fr"
 __status__ = "Test"
@@ -27,6 +27,7 @@ import sys
 #from admin.env import *
 #from parsing.FileParsers.FileParser import FileParser
 from ._Parser import Parser
+from traceback import format_tb
 class EuropresseParser(Parser):
@@ -112,15 +113,15 @@ class EuropresseParser(Parser):
         # parse all the articles, one by one
-        try:
-            for html_article in html_articles:
+        for html_article in html_articles:
+            try:
+                print("==============================new article")
                 # if there is no header at all we must skip
                 all_header = html_article.xpath(entire_header_xpath)
-                if len(all_header) == 0:
-                    print("WARNING: europress (skip) article without header")
+                all_header_text = " ".join(scrap_text(all_header))
+                if len(all_header) == 0 or len(all_header_text) == 0:
+                    hyperdata['error'] = "Europresse: html doc with no header"
+                    yield(hyperdata)
+                    print("WARNING: europresse (skip) article without header")
                     continue

                 hyperdata = {}
@@ -134,7 +135,9 @@ class EuropresseParser(Parser):
                     hyperdata['title'] = title[0]
                 except:
                     # there will be a display problem if there is no title!
-                    print("WARNING: europress (skip) article without title")
+                    print("WARNING: europresse (skip) article without title")
+                    hyperdata['error'] = "Europresse: doc with no title"
+                    yield(hyperdata)
                     continue
@@ -189,12 +192,19 @@ class EuropresseParser(Parser):
                 else:
                     # occasionally DocHeader is absent
                     # (we fall back on the entire header)
-                    search_text = " ".join(scrap_text(all_header[0]))
+                    search_text = all_header_text
                     # print("---using all header: '%s'" % search_text)

-                # we continue with date/language on the obtained zone
+                # if we found no zone at all
+                if not search_text:
+                    the_err = "europresse (skip) doc without detailed header"
+                    print("WARNING:" + the_err)
+                    hyperdata['error'] = the_err
+                    yield(hyperdata)
+                    continue
+
+                # we continue with date/language on the obtained zone...
                 # 1) a REGEXP identifies the language AND captures the date
                 test_date_fr = re.search(format_date_fr,search_text)
@@ -223,7 +233,7 @@ class EuropresseParser(Parser):
                     # match str
                     date_str = test_date_en.group()
                 else:
-                    print("WARNING europress: echec diagnostic date/langue header sur '%s'" % header)
+                    print("WARNING europresse: echec diagnostic date/langue header sur '%s'" % header)
                     # default lg value, used locally, not saved
                     doc_language = 'en'
                     # default date value, will be saved
@@ -260,8 +270,12 @@ class EuropresseParser(Parser):
                     # most probably news_topic before beginning of date
                     hyperdata['rubrique'] = header_elts[0]

-                print(hyperdata)
+                # print(hyperdata)
                 yield hyperdata

             except Exception as err:
-                print('Europresse parser: Something bad happened:' + str(err))
+                print('WARNING: europresse (skip) unknown error:"' + str(err) + '"'
+                      + "\n>>>" + (">>>".join(format_tb(err.__traceback__))))
+                hyperdata['error'] = err
+                yield(hyperdata)
+                continue
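The recurring pattern above (store a message in `hyperdata['error']`, yield the stub, `continue`) keeps the generator alive across bad articles instead of aborting the whole file. A hypothetical consumer sketch (`parserbot` and `path` are assumed names):

```python
docs, errors = [], []
for hyperdata in parserbot.parse(open(path, 'rb')):   # assumed call signature
    if 'error' in hyperdata:
        errors.append(hyperdata['error'])   # count the skipped article
        continue
    docs.append(hyperdata)
print("parsed %i docs, skipped %i" % (len(docs), len(errors)))
```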
@@ -27,7 +27,6 @@ class ISTexParser(Parser):
         }
         suma = 0
-        print(len(json_docs))
         for json_doc in json_docs:
             hyperdata = {}
 from ._Parser import Parser
 from gargantext.util.languages import languages
+from re import match

 class RISParser(Parser):
@@ -11,11 +12,15 @@ class RISParser(Parser):
         "AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
         "T2": {"type": "hyperdata", "key": "journal"},
         "UR": {"type": "hyperdata", "key": "doi"},
+        # RIS format specifications: PY is not only year but YYYY/MM/DD with MM and DD optional
+        # cf. https://en.wikipedia.org/wiki/RIS_(file_format)
         "PY": {"type": "hyperdata", "key": "publication_year"},
         "PD": {"type": "hyperdata", "key": "publication_month"},
         "N1": {"type": "hyperdata", "key": "references", "separator": ", "},
         "LA": {"type": "hyperdata", "key": "language_iso2"},
-        "A": {"type": "hyperdata", "key": "abstract", "separator": " "},
+        "AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
         "WC": {"type": "hyperdata", "key": "fields"},
     }
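For context, this is the shape of a minimal RIS record that these tags map (illustrative sample, not from the repo; note the PY value with its optional MM/DD parts):

```
TY  - JOUR
AU  - Doe, John
T2  - Journal of Examples
PY  - 1948/07//
LA  - en
AB  - An example abstract.
ER  -
```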
@@ -44,7 +49,11 @@ class RISParser(Parser):
                 # 1 - we record the previous value array...
                 if parameter["type"] == "hyperdata":
                     separator = parameter["separator"] if "separator" in parameter else ""
-                    hyperdata[parameter["key"]] = separator.join(last_values)
+                    final_value = separator.join(last_values)
+                    if last_key != 'PY':
+                        hyperdata[parameter["key"]] = final_value
+                    else:
+                        hyperdata = PY_values_decompose_and_save(final_value, hyperdata)

                 #... or even finish the record (rare here, most often after empty line)
                 elif parameter["type"] == "delimiter":
@@ -83,7 +92,11 @@ class RISParser(Parser):
             parameter = self._parameters[last_key]
             if parameter["type"] == "hyperdata":
                 separator = parameter["separator"] if "separator" in parameter else ""
-                hyperdata[parameter["key"]] = separator.join(last_values)
+                final_value = separator.join(last_values)
+                if last_key != 'PY':
+                    hyperdata[parameter["key"]] = final_value
+                else:
+                    hyperdata = PY_values_decompose_and_save(final_value, hyperdata)

         # if a hyperdata object is left in memory, yield it as well
         if hyperdata:
@@ -92,3 +105,51 @@ class RISParser(Parser):
             if 'language_iso2' not in hyperdata.keys():
                 hyperdata['language_iso2'] = 'en'
             yield hyperdata
+
+# helper function for PY dates
+def PY_values_decompose_and_save(ris_date_str, hyperdata):
+    """
+    PY is associated to our publication_year, but the exact format is:
+        "YYYY/MM/DD/" (with MM and DD optional)
+    example contents:
+        1948/07/01
+        1948/07/01/
+        1948/07//
+        1948//
+        1948
+    => This function does the necessary additional date subparsing
+       and saves the results in the 3 hyperdata slots: year, month, day
+    """
+    possible_fields = ['publication_year',
+                       'publication_month',
+                       'publication_day',
+                       None]
+    current_field_i = 0
+    buffr = ""
+
+    for char in ris_date_str:
+        if char != '/':
+            # continue reading
+            buffr += char
+        else:
+            # on '/' => we save and shift to the next field
+            current_field = possible_fields[current_field_i]
+            if len(buffr):
+                hyperdata[current_field] = buffr
+            # prepare for next time
+            current_field_i += 1
+            buffr = ""
+
+    # save at the end too
+    current_field = possible_fields[current_field_i]
+    if len(buffr):
+        hyperdata[current_field] = buffr
+
+    # return updated meta
+    return hyperdata
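A quick check of what the helper produces (hypothetical REPL lines, consistent with the docstring examples; empty fields and trailing slashes are simply skipped):

```python
PY_values_decompose_and_save("1948/07//", {})
# => {'publication_year': '1948', 'publication_month': '07'}
PY_values_decompose_and_save("1948", {})
# => {'publication_year': '1948'}
```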
@@ -57,9 +57,15 @@ class Parser:
         elif hyperdata.get('publication_year', None) is not None:
             prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
+            # eg prefixes : ['publication']
             for prefix in prefixes:
                 date_string = hyperdata[prefix + "_year"]
+
+                # FIXME: except for year, is it necessary to test that the key exists
+                #        when we have a default value in .get(key, "01") ??
                 key = prefix + "_month"
                 if key in hyperdata:
                     date_string += " " + hyperdata.get(key, "01")
@@ -79,19 +85,19 @@ class Parser:
                     hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
                 except Exception as error:
                     try:
-                        print(error, date_string)
+                        print("_Parser: error in full date parse", error, date_string)
                         # Date format: 1994 NOV-DEC
                         hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:8]).strftime("%Y-%m-%d %H:%M:%S")
                     except Exception as error:
                         try:
-                            print("error line 93", error)
+                            print("_Parser: error in short date parse", error)
                             # FIXME Date format: 1994 SPR
                             # By default, we take the year only
                             hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S")
                         except Exception as error:
-                            print("error line 99", error)
+                            print("_Parser:", error)
         else:
             print("WARNING: Date unknown at _Parser level, using now()")
             hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
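The three nested try/except blocks amount to trying progressively shorter prefixes of the date string; a standalone sketch of the same idea (assuming `date_parser` is `dateutil.parser`, as the full-date branch suggests):

```python
import dateutil.parser

def best_effort_date(date_string):
    # try the full string, then an 8-char prefix ("1994 NOV"), then the year alone
    for candidate in (str(date_string), str(date_string)[:8], str(date_string)[:4]):
        try:
            return dateutil.parser.parse(candidate).strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            continue
    return None   # the caller then falls back to now()

print(best_effort_date("1994 NOV-DEC"))   # parsed from the "1994 NOV" prefix
```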
from .MeltTagger import MeltTagger, _tag_replacements


class EnglishMeltTagger(MeltTagger):
    def __init__(self, *args, **kwargs):
        MeltTagger.__init__(self, *args, **kwargs)
        self.language = 'en'
        self._tag_replacements = _tag_replacements['en']


from .MeltTagger import MeltTagger, _tag_replacements


class FrenchMeltTagger(MeltTagger):
    def __init__(self, *args, **kwargs):
        MeltTagger.__init__(self, *args, **kwargs)
        self.language = 'fr'
        self._tag_replacements = _tag_replacements['fr']
@@ -55,7 +55,7 @@ class MeltTagger(Tagger):
     def __init__(self, *args, **kwargs):
         self.language = kwargs.pop('language', 'fr')
         self._tag_replacements = _tag_replacements[self.language]
-        super(self.__class__, self).__init__(*args, **kwargs)
+        Tagger.__init__(self, *args, **kwargs)

     def start(self, melt_data_path='lib/melttagger'):
         language = self.language
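The `super(self.__class__, …)` call had to go because MeltTagger now has subclasses: on a subclass instance, `self.__class__` is the subclass itself, so the lookup resolves back to the same `__init__` and recurses forever. A minimal sketch of the pitfall (illustrative class names, not from the commit):

```python
class Base:
    def __init__(self):
        print("Base init")

class Middle(Base):
    def __init__(self):
        # on a Child instance, self.__class__ is Child, so this resolves
        # to Middle.__init__ again: infinite recursion
        super(self.__class__, self).__init__()

class Child(Middle):
    pass

Middle()   # fine: prints "Base init"
Child()    # RecursionError: maximum recursion depth exceeded
```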
@@ -102,7 +102,8 @@ class MeltTagger(Tagger):
             if len(token.string):
                 yield (token.string, token.label, )

-    def extract(self, text, lemmatize=False):
+    def tag_text(self, text, lemmatize=False):
+        # print("IN MeltTagger.tag_text()")
         tagged_tokens = self._tag(text)
         if not lemmatize:
             # without lemmatization
@@ -123,10 +124,14 @@ class MeltTagger(Tagger):
             yield (values[0], self._tag_replacements[values[1]], values[2].replace('*', ''))
-def EnglishMeltTagger(*args, **kwargs):
-    kwargs['language'] = 'en'
-    return MeltTagger(*args, **kwargs)
-
-def FrenchMeltTagger(*args, **kwargs):
-    kwargs['language'] = 'fr'
-    return MeltTagger(*args, **kwargs)
+# 2016-09-02: these two constructors go outside
+# to respect the new tagger import
+# mechanism (1 tagger <=> 1 module)
+# def EnglishMeltTagger(*args, **kwargs):
+#     kwargs['language'] = 'en'
+#     return MeltTagger(*args, **kwargs)
+#
+# def FrenchMeltTagger(*args, **kwargs):
+#     kwargs['language'] = 'fr'
+#     return MeltTagger(*args, **kwargs)
@@ -19,8 +19,7 @@ class Tagger:
             | [][.,;"'?!():-_`]    # these are separate tokens
         ''', re.UNICODE | re.MULTILINE | re.DOTALL)
         self.buffer = []
-        self.start()
+        #self.start()
     def clean_text(self, text):
@@ -33,6 +32,7 @@ class Tagger:
         self.text = self.clean_text(text)
         grammar = nltk.RegexpParser(label + ': ' + rule)
         tagged_tokens = list(self.tag_text(self.text))
+        # print("the tagged_tokens", tagged_tokens)
         if len(tagged_tokens):
             grammar_parsed = grammar.parse(tagged_tokens)
             for subtree in grammar_parsed.subtrees():
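To see what this grammar step does with the `(token, tag)` pairs, here is a standalone sketch (hypothetical label and rule; nltk assumed installed):

```python
import nltk

tagged = [('big', 'JJ'), ('data', 'NN'), ('is', 'VBZ'), ('fun', 'NN')]
grammar = nltk.RegexpParser('NP: {<JJ>*<NN>+}')   # same shape as label + ': ' + rule
for subtree in grammar.parse(tagged).subtrees():
    if subtree.label() == 'NP':
        print(subtree.leaves())
# [('big', 'JJ'), ('data', 'NN')]
# [('fun', 'NN')]
```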
@@ -46,12 +46,11 @@ class Tagger:
         self.stop()

     def start(self):
-        """Initializes the tagger.
+        """Initializes the tagger (once per corpus).
         This method is called by the constructor, and can be overridden by
         inherited classes.
         """
-        print("START")
-        self.extract(self.text)
+        # print("START")
     def stop(self):
         """Ends the tagger.
@@ -81,7 +80,7 @@ class Tagger:
         return []

-    # Not used right now
+    # Main function for extract()
     def tag_text(self, text):
         """Send a text to be tagged.
         """
 #!/bin/bash
-export LD_LIBRARY_PATH=":/srv/gargantext_lib/taggers/nlpserver/TurboParser/deps/local/lib:"
-source /srv/env_3-5/bin/activate
-python server.py
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/srv/gargantext_lib/taggers/nlpserver/TurboParser/deps/local/lib:"
+if [[ "$VIRTUAL_ENV" != "" ]]
+then
+    source /srv/env_3-5/bin/activate
+fi
+python3 /srv/gargantext/gargantext/util/taggers/lib/nlpserver/server.py
@@ -11,7 +11,8 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
     £TODO: load whole word dictionary in ram and check existence before inserting to db => sequential insert => probably faster!
     """
-    print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
+    # print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
+    print('INTEGRATE')
     # integrate ngrams (aka new words)
     ngrams_ids = bulk_insert_ifnotexists(
         model = Ngram,    # todo type should :str ~~> :str|:re) !!!
@@ -118,7 +119,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
             # integrate ngrams and nodes-ngrams
             if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
-                print(len(nodes_ngrams_count), ">=", BATCH_NGRAMSEXTRACTION_SIZE)
+                # print(len(nodes_ngrams_count), ">=", BATCH_NGRAMSEXTRACTION_SIZE)
                 _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
                 nodes_ngrams_count.clear()
                 ngrams_data.clear()
@@ -68,7 +68,8 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
             lang_result['skipped'].append(hyperdata["language_name"])
     else:
-        print("[WARNING] no language_* found in document [parsing.py]")
+        # print("WARNING no language_* found in document [parsing.py] => "
+        #       + ("(detecting)" if DETECT_LANG else "(using default)"))

         if DETECT_LANG:
             # no language has been indexed
@@ -93,7 +94,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
 def parse(corpus):
     try:
         print("PARSING")
-        print("DETECT_LANG?", DETECT_LANG)
+        # print("DETECT_LANG?", DETECT_LANG)
         corpus.status('Docs', progress=0)
         #1 corpus => 1 or multi resources.path (for crawlers)
         resources = corpus.resources()
@@ -107,7 +108,9 @@ def parse(corpus):
                 #corpus.status(error)
                 raise ValueError("Resource '%s' has no Parser" %resource["name"])
             parserbot = load_parser(source)
-            print(parserbot)
+            # print(parserbot)

             #observed languages in default languages
             observed_languages = []
             #skipped_languages
@@ -218,10 +221,10 @@ def parse(corpus):
         # the nice iso2 codes
         observed_langs = dict(Counter(observed_languages))
-        print("#LANGAGES OK")
-        print(observed_langs)
-        print("#LANGUAGES UNKNOWN")
-        print(skipped_langs)
+        # print("#LANGAGES OK")
+        # print(observed_langs)
+        # print("#LANGUAGES UNKNOWN")
+        # print(skipped_langs)
         top_langs = sorted(observed_langs.items(), key = lambda x: x[1], reverse=True)
         if len(top_langs) > 0:
@@ -32,10 +32,15 @@ Tests
 5. ** tests ??? **
 6. ** tests ??? **
 7. **tests_070_routes**
     Checks the response types from the app url routes:
       - "/"
       - "/api/nodes"
       - "/api/nodes/<ID>"
+8. ** tests users ??? **
+9. **tests_090_toolchain**
+    Checks each data source parserbot (CSV, Pubmed, Zotero, Istex, etc.):
+      - correct parsing for a small sample

 GargTestRunner
@@ -109,4 +114,3 @@ class MyTestRecipes(TestCase):
         self.assertEqual(the_response.status_code, 200)
 ```
-*If you like the adventures of Peter Corser, read the previous album ["Doors"](https://gogs.iscpif.fr/leclaire/doors)* (Story M. Leclaire, Art R. Loth) (available in all good bookstores)
@@ -32,7 +32,7 @@ class ToolChainRecipes(TestCase):
     def setUp(self):
         #self.session = GargTestRunner.testdb_session
         self.session = session
-        self.log = logging.getLogger( "SomeTest.testSomething" )
+        self.log = logging.getLogger( "unitests.test_090_toolchain" )
         self.client = Client()
         self.user = User()
         self.project = self._create_project()
@@ -40,34 +40,27 @@ class ToolChainRecipes(TestCase):
         self.source_list.insert(0, (0, "Select a database below"))
         self.sample_files = self._collect_samples_files()

-    def tearDown(self):
-        #del self.session
-        del self.client
-        #del self.factory
-        del self.source_list
-        del self.sample_files
-        del self.project
-
     def _create_project(self):
-        self.project = Node(
+        project = Node(
             user_id = self.user.id,
             typename = 'PROJECT',
             name = "test1000",
         )
-        self.session.add(self.project)
+        self.session.add(project)
         self.session.commit()
-        return self.project
+        return project

     def __count_node_children__(self, CurrNode, typename=None):
-        '''find ALL the children of a given Node [optionnal filter TYPENAME] '''
+        '''count ALL the children of a given Node [optional filter TYPENAME] '''
         if typename is None:
-            self.children = CurrNode.children('', order=True).count()
+            children = CurrNode.children('').count()
         else:
-            self.children = CurrNode.children(typename, order=True).count()
-        return self.children
+            children = CurrNode.children(typename).count()
+        return children

     def __find_node_parent__(self, CurrNode):
         '''find the parent Node given a CurrNode '''
-        self.parent = self.session.query(Node).filter(Node.id == Node.parent_id, Node.name == name).first()
+        self.parent = self.session.query(Node).filter(Node.id == CurrNode.parent_id).first()

     def _collect_samples_files(self):
         from collections import defaultdict
@@ -79,22 +72,24 @@ class ToolChainRecipes(TestCase):
         for format_source in os.listdir(DATA_SAMPLE_DIR):
             #self.log.debug(format_source)
             full_path = join(DATA_SAMPLE_DIR, format_source)
-            if not os.path.isfile(full_path):
+            if not isfile(full_path):
                 if format_source in sources:
                     self.sample_files[format_source] = [join(full_path, samplef) for samplef in os.listdir(full_path)]
         return self.sample_files
     def _create_corpus(self, name, source_type, sample_file):
-        self.corpus = self.project.add_child(
+        corpus = self.project.add_child(
             name = name,
             typename = 'CORPUS',
         )
-        self.corpus.add_resource(
+        corpus.add_resource(
             type = int(source_type),
             path = sample_file,
         )
-        self.session.add(self.corpus)
+        self.session.add(corpus)
         self.session.commit()
-        return self.corpus
+        return corpus

     def _get_corpus(self, name):
         corpus = self.session.query(Node).filter(Node.typename == "CORPUS", Node.name == name).first()
         return corpus
@@ -104,7 +99,7 @@ class ToolChainRecipes(TestCase):
         Each of the resource input tests can follow this common recipe base
         @param source_type: int (cf. constants.py RESOURCETYPES)
-        @param expected_results: []int (number of docs for each sample corpus of this source)
+        @param expected_results: int[] (number of docs for each sample corpus of this source)
         """
         source = get_resource(source_type)
         source_name = source["name"].split("[")[0].lower().strip().replace(" ", "_")
@@ -168,8 +163,3 @@ class ToolChainRecipes(TestCase):
     def tests_010(self):
         self._run_recipe(10, DATA_SAMPLE_NDOCS[10])
-
-if __name__ == "__main__":
-    logging.basicConfig( stream=sys.stderr )
-    logging.getLogger( "unitests.test_090_toolchain" ).setLevel( logging.DEBUG )
-    unittest.main()
"""
API UNIT TESTS
================
"""
from django.test import TestCase, Client
from gargantext.models import Node
from gargantext.util.db import session
from rest_framework.test import APIClient
from rest_framework.test import APIRequestFactory
# Using the standard RequestFactory API to create a form POST request
#factory = APIRequestFactory()
class APIRecipe(TestCase):
def setUp(self):
"""
Will be run before each test
"""
self.client = Client()
# login with our fake user
response = self.client.post(
'/auth/login/',
{'username': 'pcorser', 'password': 'peter'}
)
self.create_project()
self.create_corpus()
self.factory = APIRequestFactory()
def create_project(self):
new_project = Node(
typename = 'PROJECT',
name = "My project",
)
session.add(new_project)
session.commit()
self.project = new_project
def create_corpus(self):
#create a default corpus
self.corpus = self.project.add_child(
name = "My Corpus",
typename = 'CORPUS',
)
session.add(self.corpus)
session.commit()
def test_001_post_project(self):
'''POST /projects'''
request = self.factory.post('/api/projects/', {'name': 'PROJECT TEST'}, format='json')
def test_002_get_projects(self):
'''GET /projects'''
request = self.factory.get('/api/projects/', format='json')
def test_003_put_projects(self):
'''PUT /projects'''
request = self.factory.put('/api/projects/', {"name": "My TEST PROJECT"}, format='json')
def test_004_delete_projects(self):
'''DELETE /projects'''
request = self.factory.delete('/api/projects/', format='json')
def test_005_delete_project(self):
'''DELETE /project'''
request = self.factory.delete('/api/project/%s' %self.project.id, format='json')
def test_006_get_project(self):
'''GET /PROJECT'''
request = self.factory.get('/api/project/%s' %self.project.id, format='json')
def test_007_put_project(self):
''' PUT /PROJECT '''
request = self.factory.put('/api/project/%s' %self.project.id, {"name": "My New Project"}, format='json')
# def test_008_post_corpus(self):
# '''POST /project'''
# request = self.factory.post('/project/', {'name': 'PROJECT TEST'})
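As written, these recipes build requests without asserting on any response; a natural next step, following the pattern of tests_070_routes, would be to go through `self.client` and check the status code. A hypothetical test method (not part of the commit):

```python
    def test_009_get_projects_status(self):
        '''GET /api/projects/ (this time asserting on the response)'''
        the_response = self.client.get('/api/projects/')
        self.assertEqual(the_response.status_code, 200)
```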