Prepare a method for checking content file = is it the right parser?

5866a043 · c24b · 3374d428 · 5866a043 · 5866a043 · 5866a043
Commit 5866a043 authored May 11, 2016 by c24b
Showing with 21 additions and 12 deletions

parser.md docs/overview/parser.md +14 -11

constants.py gargantext/constants.py +2 -1

files.py gargantext/util/files.py +2 -0

_Parser.py gargantext/util/parsers/_Parser.py +3 -0

No files found.
--- a/docs/overview/parser.md
+++ b/docs/overview/parser.md
@@ -50,35 +50,38 @@ exposé dans `/templates/pages/projects/project.html`
 ## reference your parser script
-## add your script into gargantext/util/
+## add your parser script into the folder gargantext/util/parsers/
-here filename is Cern.py
+here my filename was Cern.py
 ## declare it in gargantext/util/parsers/__init__.py
 from .Cern  import CernParser
+At this step, you will be able to see your parser and add a file with the form
+but nothing will occur
-## add your parser script into gargantext/util/parser/
+## the right way to write the scraper script
-At this step, you will be able to see your parser and add a file with the form
+Three main and only requirements:
-it will send the job to toolchain
+* your parser class should inherit from the base class Parser (defined in _Parser.py)
-##
+* your parser class must have a parse method that takes a **filename** as input
-parse_extract_indexhyperdata(corpus)
+* your parser must structure and store data in a variable named **hyperdata_list**
+to be properly indexed by toolchain
+# Adding a scraper API to offer a search option:
 * Add pop up question Do you have a corpus
 option search in /templates/pages/projects/project.html line 181
-adding
 # Some changes
 * adding accepted_formats in constants
-* adding check_file routine in Form check
+* adding check_file routine in Form check ==> but it should reuse util/files.py,
+which has already implemented the upload size limit check
 # Suggestion next step:
 * XML parser MARC21 UNIMARC ...
 * A project type is qualified by the first element added, i.e.:
 the first element determines the type of corpus of all the corpora within the project
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -246,7 +246,8 @@ from .settings import BASE_DIR
 # uploads/.gitignore prevents corpora indexing
 # corpora can be either a folder or a symlink towards a specific partition
 UPLOAD_DIRECTORY   = os.path.join(BASE_DIR, 'uploads/corpora')
-UPLOAD_LIMIT       = 1024 * 1024 * 1024
+UPLOAD_LIMIT       = 1024
+#* 1024 * 1024
 DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY

--- a/gargantext/util/files.py
+++ b/gargantext/util/files.py
@@ -25,11 +25,13 @@ def download(url, name=''):
 def upload(uploaded):
+    print(repr(uploaded))
    if uploaded.size > UPLOAD_LIMIT:
        raise IOError('Uploaded file is bigger than allowed: %d > %d' % (
            uploaded.size,
            UPLOAD_LIMIT,
        ))
    return save(
        contents = uploaded.file.read(),
        name = uploaded.name,

--- a/gargantext/util/parsers/_Parser.py
+++ b/gargantext/util/parsers/_Parser.py
@@ -23,6 +23,9 @@ class Parser:
    def __del__(self):
        self._file.close()
+    def detect_format(self, accepted_format):
+        print(self._file[:1000])
    def detect_encoding(self, string):
        """Useful method to detect the encoding of a document.
        """