[hal] rewrite crawler, some small fixes to our code

parent b71d4a5f
Pipeline #5783 canceled with stages
......@@ -18,7 +18,7 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in.
expected_cabal_project_hash="1cbb47fd3f929a01b3b968cc2e148dcbf5ef4e662e14ed9832d32471a68f6766"
expected_cabal_project_hash="3bfa2552464823ff4f1d892e9dc2778a9cbf1a153a6639ec9caf87e6d9c75a7b"
expected_cabal_project_freeze_hash="2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5"
cabal --store-dir=$STORE_DIR v2-build --dry-run
......
......@@ -106,7 +106,7 @@ source-repository-package
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
tag: bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73
tag: b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
source-repository-package
type: git
......
......@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API
, externalAPIs
) where
import Conduit
import Control.Monad.Except
import Conduit ( ConduitT, yieldMany )
import Data.Text qualified as T
import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
......@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX
import Gargantext.Core.Text.Corpus.API.OpenAlex qualified as OpenAlex
import Gargantext.Core.Text.Corpus.API.Pubmed qualified as PUBMED
import Gargantext.Core.Text.Corpus.Query qualified as Corpus
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (get)
import PUBMED.Types qualified as PUBMED
import Servant.Client (ClientError)
......@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639 lang) limit
where
parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q
......@@ -12,29 +12,27 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Hal
where
import Conduit
import Data.Either
import Conduit ( (.|), ConduitT, mapMC )
import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map
import Data.Maybe
import Data.Text (pack, intercalate)
import Data.Text (pack)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (intercalate)
import HAL qualified as HAL
import HAL.Client qualified as HAL
import HAL qualified
import HAL.Doc.Corpus qualified as HAL
import HAL.Types qualified as HAL
import Servant.Client (ClientError)
get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
either (panicTrace . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
either (panicTrace . pack . show) (mapM (toDoc' la) . HAL._docs) eDocs
getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) la
eRes <- HAL.getMetadataWithCursorC q (fromIntegral <$> ml) la
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of
-- Left err -> panic $ pack $ show err
......@@ -43,21 +41,21 @@ getC la q ml = do
toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h
let mDateS = maybe (Just $ pack $ show Defaults.year) Just _corpus_date
let mDateS = _corpus_date <|> Just (pack $ show Defaults.year)
let (utctime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
let abstractDefault = intercalate " " _corpus_abstract
let abstractDefault = unwords _corpus_abstract
let abstract = case la of
Nothing -> abstractDefault
Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map)
Just l -> maybe abstractDefault unwords (Map.lookup l _corpus_abstract_lang_map)
pure HyperdataDocument { _hd_bdd = Just "Hal"
, _hd_doi = Just $ pack $ show _corpus_docid
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just $ intercalate " " _corpus_title
, _hd_authors = Just $ foldl (\x y -> if x == "" then y else x <> ", " <> y) "" _corpus_authors_names
, _hd_institutes = Just $ foldl (\x y -> if x == "" then y else x <> ", " <> y) "" $ _corpus_authors_affiliations <> map show _corpus_struct_id
, _hd_title = Just $ unwords _corpus_title
, _hd_authors = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" _corpus_authors_names
, _hd_institutes = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" $ _corpus_authors_affiliations <> map show _corpus_struct_id
, _hd_source = Just $ maybe "Nothing" identity _corpus_source
, _hd_abstract = Just abstract
, _hd_publication_date = fmap show utctime
......
......@@ -20,9 +20,9 @@ import Data.List qualified as List
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (get)
import ISTEX qualified as ISTEX
import ISTEX qualified
import ISTEX.Client qualified as ISTEX
type Query = Text
......@@ -40,14 +40,14 @@ get la query' maxResults = do
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
let query = case (List.length $ Text.splitOn ":" query') == 1 of
let query = if List.length (Text.splitOn ":" query') == 1 then
-- True case means users is entering default search of IsTex
-- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT
-- Second : query in abstract
True -> ("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query'
("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query'
False -> query'
else query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing
......
......@@ -135,7 +135,7 @@
git: "https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git"
subdirs:
- .
- commit: bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73
- commit: b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
git: "https://gitlab.iscpif.fr/gargantext/crawlers/hal.git"
subdirs:
- .
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment