#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** HAL Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence

# Resource type id handed to get_resource() to look up the HAL source
# (presumably the HAL entry's index in gargantext.constants — TODO confirm).
RESOURCE_TYPE_HAL = 11

from django.shortcuts               import redirect, render
from django.http                    import Http404, HttpResponseRedirect \
                                                  , HttpResponseForbidden

from gargantext.constants           import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes        import Node
from gargantext.util.db             import session
from gargantext.util.db_cache       import cache
from gargantext.util.http           import JsonHttpResponse
from gargantext.util.scheduling     import scheduled
from gargantext.util.toolchain      import parse_extract_indexhyperdata


def query(request):
    '''Return, as JSON, how many HAL records match the POSTed query.

    Parameters:
        request: the incoming request; only POST is served and the search
            expression is read from its "query" POST field.

    Returns:
        JsonHttpResponse wrapping {"results_nb": <crawler's results_nb>}.

    Raises:
        Http404: for non-POST requests, or when the HAL resource has no
            crawler configured. (Previously both cases fell off the end of
            the view and returned None, which is not a valid response.)
    '''
    if request.method != "POST":
        raise Http404()

    search = request.POST["query"]
    source = get_resource(RESOURCE_TYPE_HAL)
    if source["crawler"] is None:
        raise Http404()

    crawlerbot = load_crawler(source)()
    # Called for its side effect: presumably this is what fills in
    # crawlerbot.results_nb (confirm against the crawler class).
    crawlerbot.scan_results(search)
    return JsonHttpResponse({"results_nb": crawlerbot.results_nb})

def save(request, project_id, return_corpus=False):
    '''Download the HAL records for the POSTed query into a new CORPUS node.

    Parameters:
        request: the incoming request; only POST is served. The "query"
            and "N" POST fields are read from it.
        project_id: id of the parent project node (anything int() accepts).
        return_corpus: when true, return the new corpus Node instead of
            rendering the waiting page.

    Returns:
        HttpResponseForbidden when the requesting user does not own the
        project; the corpus Node when return_corpus is true; otherwise the
        rendered 'pages/projects/wait.html' page.

    Raises:
        Http404: non-POST request, missing / non-numeric / non-positive
            "N", non-integer project_id, unknown project, or no crawler
            configured for the HAL resource.
    '''
    import traceback

    if request.method != "POST":
        # The former fallback built a JSON answer from names that were never
        # defined and could only fail with a NameError; refuse explicitly.
        raise Http404()

    query = request.POST.get("query")
    try:
        N = int(request.POST.get("N"))
    except (TypeError, ValueError):
        # "N" is missing (None) or is not a number
        N = 0
    if N <= 0:
        raise Http404()
    # N is validated and capped here, but is not passed to the crawler yet
    # (see the download step below).
    N = min(N, QUERY_SIZE_N_MAX)

    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()

    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()

    source = get_resource(RESOURCE_TYPE_HAL)
    if source["crawler"] is None:
        # same guard as query(): nothing can be downloaded without a crawler
        raise Http404()

    # corpus node instantiation
    corpus = Node(
        name      = query,
        user_id   = request.user.id,
        parent_id = project_id,
        typename  = 'CORPUS',
        hyperdata = {"action": "Scrapping data"},
    )

    crawler_bot = load_crawler(source)()
    # The long running command; for now there is no way to force downloading
    # a given number of records. Its return value is not needed because the
    # resource path is read from crawler_bot.path.
    crawler_bot.download(query)
    corpus.add_resource(
        type = source["type"],
        path = crawler_bot.path,
    )

    session.add(corpus)
    session.commit()

    try:
        scheduled(parse_extract_indexhyperdata)(corpus.id)
    except Exception as error:
        print('WORKFLOW ERROR')
        print(error)
        traceback.print_tb(error.__traceback__)
        # IMPORTANT: sanitize the session after the interrupted transaction
        session.rollback()

    if return_corpus:
        return corpus

    return render(
        template_name = 'pages/projects/wait.html',
        request = request,
        context = {
            'user'   : request.user,
            'project': project,
        },
    )