Prepare a method for checking content file = is it the right parser?

5866a043 · c24b · 3374d428 · 5866a043 · 5866a043 · 5866a043
Commit 5866a043 authored May 11, 2016 by c24b
Showing with 21 additions and 12 deletions

parser.md docs/overview/parser.md +14 -11

constants.py gargantext/constants.py +2 -1

files.py gargantext/util/files.py +2 -0

_Parser.py gargantext/util/parsers/_Parser.py +3 -0

No files found.
--- a/docs/overview/parser.md
+++ b/docs/overview/parser.md
@@ -50,35 +50,38 @@ exposé dans `/templates/pages/projects/project.html`

 ## reference your parser script

-## add your script into gargantext/util/
-here filename is Cern.py
+## add your parser script into folder gargantext/util/parsers/
+here my filename was Cern.py

 ##declare it into gargantext/util/parser/__init__.py
 from .Cern  import CernParser

+At this step, you will be able to see your parser and add a file with the form,
+but nothing will happen yet

-## add your parser script into gargantext/util/parser/
+## the right way to write the scraper script

-At this step, you will be able to see your parser and add a file with the form
-it will send the job to toolchain
-##
-parse_extract_indexhyperdata(corpus)
+Three main and only requirements:
+* your parser class should inherit from the base class _Parser()
+* your parser class must have a parse method that takes a **filename** as input
+* your parser must structure and store data into the **hyperdata_list** variable
+to be properly indexed by the toolchain

+# Adding a scraper API to offer a search option:
 * Add pop up question Do you have a corpus
 option search in /templates/pages/projects/project.html line 181


-adding
+

 # Some changes
 * adding accepted_formats in constants
-* adding check_file routine in Form check
+* adding check_file routine in Form check ==> but it should inherit from gargantext/util/files.py,
+which has also implemented the upload size limit check

 # Suggestion next step:

-
 * XML parser MARC21 UNIMARC ...
-
 * A project type is qualified by the first element add i.e:
 the first element determine the type of corpus of all the corpora within the project

--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -246,7 +246,8 @@ from .settings import BASE_DIR
 # uploads/.gitignore prevents corpora indexing
 # copora can be either a folder or symlink towards specific partition
 UPLOAD_DIRECTORY   = os.path.join(BASE_DIR, 'uploads/corpora')
-UPLOAD_LIMIT       = 1024 * 1024 * 1024
+UPLOAD_LIMIT       = 1024
+#* 1024 * 1024
 DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY



--- a/gargantext/util/files.py
+++ b/gargantext/util/files.py
@@ -25,11 +25,13 @@ def download(url, name=''):


 def upload(uploaded):
+    print(repr(uploaded))
    if uploaded.size > UPLOAD_LIMIT:
        raise IOError('Uploaded file is bigger than allowed: %d > %d' % (
            uploaded.size,
            UPLOAD_LIMIT,
        ))
+
    return save(
        contents = uploaded.file.read(),
        name = uploaded.name,

--- a/gargantext/util/parsers/_Parser.py
+++ b/gargantext/util/parsers/_Parser.py
@@ -23,6 +23,9 @@ class Parser:
    def __del__(self):
        self._file.close()

+    def detect_format(self, accepted_format):
+        print(self._file[:1000])
+
    def detect_encoding(self, string):
        """Useful method to detect the encoding of a document.
        """