[hal] rewrite crawler, some small fixes to our code

parent b71d4a5f
Pipeline #5783 canceled with stages
...@@ -18,7 +18,7 @@ fi ...@@ -18,7 +18,7 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and # with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI # `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in. # cache can kick in.
expected_cabal_project_hash="1cbb47fd3f929a01b3b968cc2e148dcbf5ef4e662e14ed9832d32471a68f6766" expected_cabal_project_hash="3bfa2552464823ff4f1d892e9dc2778a9cbf1a153a6639ec9caf87e6d9c75a7b"
expected_cabal_project_freeze_hash="2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5" expected_cabal_project_freeze_hash="2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5"
cabal --store-dir=$STORE_DIR v2-build --dry-run cabal --store-dir=$STORE_DIR v2-build --dry-run
......
...@@ -106,7 +106,7 @@ source-repository-package ...@@ -106,7 +106,7 @@ source-repository-package
source-repository-package source-repository-package
type: git type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
tag: bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73 tag: b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
source-repository-package source-repository-package
type: git type: git
......
...@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API ...@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API
, externalAPIs , externalAPIs
) where ) where
import Conduit import Conduit ( ConduitT, yieldMany )
import Control.Monad.Except
import Data.Text qualified as T import Data.Text qualified as T
import EPO.API.Client.Types qualified as EPO import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs) import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
...@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX ...@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX
import Gargantext.Core.Text.Corpus.API.OpenAlex qualified as OpenAlex import Gargantext.Core.Text.Corpus.API.OpenAlex qualified as OpenAlex
import Gargantext.Core.Text.Corpus.API.Pubmed qualified as PUBMED import Gargantext.Core.Text.Corpus.API.Pubmed qualified as PUBMED
import Gargantext.Core.Text.Corpus.Query qualified as Corpus import Gargantext.Core.Text.Corpus.Query qualified as Corpus
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (get) import Gargantext.Prelude hiding (get)
import PUBMED.Types qualified as PUBMED import PUBMED.Types qualified as PUBMED
import Servant.Client (ClientError) import Servant.Client (ClientError)
...@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do ...@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639 lang) limit first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639 lang) limit
where where
parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q
...@@ -12,29 +12,27 @@ Portability : POSIX ...@@ -12,29 +12,27 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Hal module Gargantext.Core.Text.Corpus.API.Hal
where where
import Conduit import Conduit ( (.|), ConduitT, mapMC )
import Data.Either
import Data.LanguageCodes qualified as ISO639 import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Maybe import Data.Text (pack)
import Data.Text (pack, intercalate)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (intercalate) import Gargantext.Prelude hiding (intercalate)
import HAL qualified as HAL import HAL qualified
import HAL.Client qualified as HAL
import HAL.Doc.Corpus qualified as HAL import HAL.Doc.Corpus qualified as HAL
import HAL.Types qualified as HAL
import Servant.Client (ClientError) import Servant.Client (ClientError)
get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument] get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do get la q ml = do
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
either (panicTrace . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs either (panicTrace . pack . show) (mapM (toDoc' la) . HAL._docs) eDocs
getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())) getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do getC la q ml = do
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) la eRes <- HAL.getMetadataWithCursorC q (fromIntegral <$> ml) la
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of -- case eRes of
-- Left err -> panic $ pack $ show err -- Left err -> panic $ pack $ show err
...@@ -43,21 +41,21 @@ getC la q ml = do ...@@ -43,21 +41,21 @@ getC la q ml = do
toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus { .. }) = do toDoc' la (HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h -- printDebug "[toDoc corpus] h" h
let mDateS = maybe (Just $ pack $ show Defaults.year) Just _corpus_date let mDateS = _corpus_date <|> Just (pack $ show Defaults.year)
let (utctime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS let (utctime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
let abstractDefault = intercalate " " _corpus_abstract let abstractDefault = unwords _corpus_abstract
let abstract = case la of let abstract = case la of
Nothing -> abstractDefault Nothing -> abstractDefault
Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map) Just l -> maybe abstractDefault unwords (Map.lookup l _corpus_abstract_lang_map)
pure HyperdataDocument { _hd_bdd = Just "Hal" pure HyperdataDocument { _hd_bdd = Just "Hal"
, _hd_doi = Just $ pack $ show _corpus_docid , _hd_doi = Just $ pack $ show _corpus_docid
, _hd_url = Nothing , _hd_url = Nothing
, _hd_uniqId = Nothing , _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing , _hd_uniqIdBdd = Nothing
, _hd_page = Nothing , _hd_page = Nothing
, _hd_title = Just $ intercalate " " _corpus_title , _hd_title = Just $ unwords _corpus_title
, _hd_authors = Just $ foldl (\x y -> if x == "" then y else x <> ", " <> y) "" _corpus_authors_names , _hd_authors = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" _corpus_authors_names
, _hd_institutes = Just $ foldl (\x y -> if x == "" then y else x <> ", " <> y) "" $ _corpus_authors_affiliations <> map show _corpus_struct_id , _hd_institutes = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" $ _corpus_authors_affiliations <> map show _corpus_struct_id
, _hd_source = Just $ maybe "Nothing" identity _corpus_source , _hd_source = Just $ maybe "Nothing" identity _corpus_source
, _hd_abstract = Just abstract , _hd_abstract = Just abstract
, _hd_publication_date = fmap show utctime , _hd_publication_date = fmap show utctime
......
...@@ -20,9 +20,9 @@ import Data.List qualified as List ...@@ -20,9 +20,9 @@ import Data.List qualified as List
import Data.Text qualified as Text import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc) import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (get) import Gargantext.Prelude hiding (get)
import ISTEX qualified as ISTEX import ISTEX qualified
import ISTEX.Client qualified as ISTEX import ISTEX.Client qualified as ISTEX
type Query = Text type Query = Text
...@@ -40,14 +40,14 @@ get la query' maxResults = do ...@@ -40,14 +40,14 @@ get la query' maxResults = do
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml) -- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml) -- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
let query = case (List.length $ Text.splitOn ":" query') == 1 of let query = if List.length (Text.splitOn ":" query') == 1 then
-- True case means users is entering default search of IsTex -- True case means users is entering default search of IsTex
-- In that case we need to enrich his query with 2 parameters -- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT -- First expected language: user has to define it in GTXT
-- Second : query in abstract -- Second : query in abstract
True -> ("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query' ("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query'
False -> query' else query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is -- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing -- in that case we suppose user is knowing what s.he is doing
......
...@@ -135,7 +135,7 @@ ...@@ -135,7 +135,7 @@
git: "https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git" git: "https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git"
subdirs: subdirs:
- . - .
- commit: bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73 - commit: b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
git: "https://gitlab.iscpif.fr/gargantext/crawlers/hal.git" git: "https://gitlab.iscpif.fr/gargantext/crawlers/hal.git"
subdirs: subdirs:
- . - .
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment