Commit 39ea62eb authored by Alexandre Delanoë

Merge remote-tracking branch 'origin/327-dev-rewrite-hal-crawler' into dev

parents 25a14cb2 629d7af7
...@@ -18,7 +18,7 @@ fi ...@@ -18,7 +18,7 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and # with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI # `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in. # cache can kick in.
expected_cabal_project_hash="1cbb47fd3f929a01b3b968cc2e148dcbf5ef4e662e14ed9832d32471a68f6766" expected_cabal_project_hash="3bfa2552464823ff4f1d892e9dc2778a9cbf1a153a6639ec9caf87e6d9c75a7b"
expected_cabal_project_freeze_hash="2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5" expected_cabal_project_freeze_hash="2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5"
cabal --store-dir=$STORE_DIR v2-build --dry-run cabal --store-dir=$STORE_DIR v2-build --dry-run
......
...@@ -106,7 +106,7 @@ source-repository-package ...@@ -106,7 +106,7 @@ source-repository-package
source-repository-package source-repository-package
type: git type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
tag: bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73 tag: b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
source-repository-package source-repository-package
type: git type: git
......
...@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API ...@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API
, externalAPIs , externalAPIs
) where ) where
import Conduit import Conduit ( ConduitT, yieldMany )
import Control.Monad.Except
import Data.Text qualified as T import Data.Text qualified as T
import EPO.API.Client.Types qualified as EPO import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs) import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
...@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX ...@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX
import Gargantext.Core.Text.Corpus.API.OpenAlex qualified as OpenAlex import Gargantext.Core.Text.Corpus.API.OpenAlex qualified as OpenAlex
import Gargantext.Core.Text.Corpus.API.Pubmed qualified as PUBMED import Gargantext.Core.Text.Corpus.API.Pubmed qualified as PUBMED
import Gargantext.Core.Text.Corpus.Query qualified as Corpus import Gargantext.Core.Text.Corpus.Query qualified as Corpus
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (get) import Gargantext.Prelude hiding (get)
import PUBMED.Types qualified as PUBMED import PUBMED.Types qualified as PUBMED
import Servant.Client (ClientError) import Servant.Client (ClientError)
...@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do ...@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639 lang) limit first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639 lang) limit
where where
parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q
...@@ -20,9 +20,9 @@ import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date ...@@ -20,9 +20,9 @@ import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (intercalate) import Gargantext.Prelude hiding (intercalate)
import HAL qualified as HAL import HAL qualified
import HAL.Client qualified as HAL
import HAL.Doc.Corpus qualified as HAL import HAL.Doc.Corpus qualified as HAL
import HAL.Types qualified as HAL
import Servant.Client (ClientError) import Servant.Client (ClientError)
get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument] get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
...@@ -32,7 +32,7 @@ get la q ml = do ...@@ -32,7 +32,7 @@ get la q ml = do
getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())) getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do getC la q ml = do
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) la eRes <- HAL.getMetadataWithCursorC q (fromIntegral <$> ml) la
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of -- case eRes of
-- Left err -> panic $ pack $ show err -- Left err -> panic $ pack $ show err
...@@ -41,7 +41,7 @@ getC la q ml = do ...@@ -41,7 +41,7 @@ getC la q ml = do
toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus { .. }) = do toDoc' la (HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h -- printDebug "[toDoc corpus] h" h
let mDateS = maybe (Just $ pack $ show Defaults.year) Just _corpus_date let mDateS = _corpus_date <|> Just (pack $ show Defaults.year)
let (utctime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS let (utctime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
let abstractDefault = unwords _corpus_abstract let abstractDefault = unwords _corpus_abstract
let abstract = case la of let abstract = case la of
...@@ -52,8 +52,8 @@ toDoc' la (HAL.Corpus { .. }) = do ...@@ -52,8 +52,8 @@ toDoc' la (HAL.Corpus { .. }) = do
, _hd_url = Nothing , _hd_url = Nothing
, _hd_page = Nothing , _hd_page = Nothing
, _hd_title = Just $ unwords _corpus_title , _hd_title = Just $ unwords _corpus_title
, _hd_authors = Just $ foldl (\x y -> if x == "" then y else x <> ", " <> y) "" _corpus_authors_names , _hd_authors = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" _corpus_authors_names
, _hd_institutes = Just $ foldl (\x y -> if x == "" then y else x <> ", " <> y) "" $ _corpus_authors_affiliations <> map show _corpus_struct_id , _hd_institutes = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" $ _corpus_authors_affiliations <> map show _corpus_struct_id
, _hd_source = Just $ maybe "Nothing" identity _corpus_source , _hd_source = Just $ maybe "Nothing" identity _corpus_source
, _hd_abstract = Just abstract , _hd_abstract = Just abstract
, _hd_publication_date = fmap show utctime , _hd_publication_date = fmap show utctime
......
...@@ -20,9 +20,9 @@ import Data.List qualified as List ...@@ -20,9 +20,9 @@ import Data.List qualified as List
import Data.Text qualified as Text import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc) import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (get) import Gargantext.Prelude hiding (get)
import ISTEX qualified as ISTEX import ISTEX qualified
import ISTEX.Client qualified as ISTEX import ISTEX.Client qualified as ISTEX
type Query = Text type Query = Text
...@@ -40,14 +40,14 @@ get la query' maxResults = do ...@@ -40,14 +40,14 @@ get la query' maxResults = do
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml) -- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml) -- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
let query = case (List.length $ Text.splitOn ":" query') == 1 of let query = if List.length (Text.splitOn ":" query') == 1 then
-- True case means users is entering default search of IsTex -- True case means users is entering default search of IsTex
-- In that case we need to enrich his query with 2 parameters -- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT -- First expected language: user has to define it in GTXT
-- Second : query in abstract -- Second : query in abstract
True -> ("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query' ("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query'
False -> query' else query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is -- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing -- in that case we suppose user is knowing what s.he is doing
......
...@@ -135,7 +135,7 @@ ...@@ -135,7 +135,7 @@
git: "https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git" git: "https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git"
subdirs: subdirs:
- . - .
- commit: bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73 - commit: b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
git: "https://gitlab.iscpif.fr/gargantext/crawlers/hal.git" git: "https://gitlab.iscpif.fr/gargantext/crawlers/hal.git"
subdirs: subdirs:
- . - .
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment