Commit 39ea62eb authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/327-dev-rewrite-hal-crawler' into dev

parents 25a14cb2 629d7af7
......@@ -18,7 +18,7 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in.
expected_cabal_project_hash="1cbb47fd3f929a01b3b968cc2e148dcbf5ef4e662e14ed9832d32471a68f6766"
expected_cabal_project_hash="3bfa2552464823ff4f1d892e9dc2778a9cbf1a153a6639ec9caf87e6d9c75a7b"
expected_cabal_project_freeze_hash="2c8960ffcf1b94aa11a3543e3b5facd2db5af19569fecaec4bc0ab4c1edd22a5"
cabal --store-dir=$STORE_DIR v2-build --dry-run
......
......@@ -106,7 +106,7 @@ source-repository-package
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
tag: bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73
tag: b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
source-repository-package
type: git
......
......@@ -18,8 +18,7 @@ module Gargantext.Core.Text.Corpus.API
, externalAPIs
) where
import Conduit
import Control.Monad.Except
import Conduit ( ConduitT, yieldMany )
import Data.Text qualified as T
import EPO.API.Client.Types qualified as EPO
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
......@@ -32,7 +31,7 @@ import Gargantext.Core.Text.Corpus.API.Istex qualified as ISTEX
import Gargantext.Core.Text.Corpus.API.OpenAlex qualified as OpenAlex
import Gargantext.Core.Text.Corpus.API.Pubmed qualified as PUBMED
import Gargantext.Core.Text.Corpus.Query qualified as Corpus
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (get)
import PUBMED.Types qualified as PUBMED
import Servant.Client (ClientError)
......@@ -80,3 +79,5 @@ get externalAPI lang q mPubmedAPIKey epoAuthKey epoAPIUrl limit = do
first ExternalAPIError <$> EPO.get epoAuthKey epoAPIUrl q (toISO639 lang) limit
where
parse_query = first (InvalidInputQuery q . T.pack) $ Corpus.parseQuery q
......@@ -20,9 +20,9 @@ import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (intercalate)
import HAL qualified as HAL
import HAL.Client qualified as HAL
import HAL qualified
import HAL.Doc.Corpus qualified as HAL
import HAL.Types qualified as HAL
import Servant.Client (ClientError)
get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
......@@ -32,7 +32,7 @@ get la q ml = do
getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) la
eRes <- HAL.getMetadataWithCursorC q (fromIntegral <$> ml) la
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of
-- Left err -> panic $ pack $ show err
......@@ -41,7 +41,7 @@ getC la q ml = do
toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h
let mDateS = maybe (Just $ pack $ show Defaults.year) Just _corpus_date
let mDateS = _corpus_date <|> Just (pack $ show Defaults.year)
let (utctime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
let abstractDefault = unwords _corpus_abstract
let abstract = case la of
......@@ -52,8 +52,8 @@ toDoc' la (HAL.Corpus { .. }) = do
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just $ unwords _corpus_title
, _hd_authors = Just $ foldl (\x y -> if x == "" then y else x <> ", " <> y) "" _corpus_authors_names
, _hd_institutes = Just $ foldl (\x y -> if x == "" then y else x <> ", " <> y) "" $ _corpus_authors_affiliations <> map show _corpus_struct_id
, _hd_authors = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" _corpus_authors_names
, _hd_institutes = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" $ _corpus_authors_affiliations <> map show _corpus_struct_id
, _hd_source = Just $ maybe "Nothing" identity _corpus_source
, _hd_abstract = Just abstract
, _hd_publication_date = fmap show utctime
......
......@@ -20,9 +20,9 @@ import Data.List qualified as List
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (get)
import ISTEX qualified as ISTEX
import ISTEX qualified
import ISTEX.Client qualified as ISTEX
type Query = Text
......@@ -40,14 +40,14 @@ get la query' maxResults = do
-- eDocs <- ISTEX.getMetadataScroll (q <> " abstract:*") "1m" Nothing 0 --(fromIntegral <$> ml)
-- eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0 --(fromIntegral <$> ml)
let query = case (List.length $ Text.splitOn ":" query') == 1 of
let query = if List.length (Text.splitOn ":" query') == 1 then
-- True case means users is entering default search of IsTex
-- In that case we need to enrich his query with 2 parameters
-- First expected language: user has to define it in GTXT
-- Second : query in abstract
True -> ("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query'
("language:"<> toISTEXLanguageCode la) <> " AND abstract:"<>query'
False -> query'
else query'
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing
......
......@@ -135,7 +135,7 @@
git: "https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git"
subdirs:
- .
- commit: bfa9069b4ff70f341ca3244e8aff9e83eb4b8b73
- commit: b99b9e568c8bdc73af2b8016ed03ba5ee83c2030
git: "https://gitlab.iscpif.fr/gargantext/crawlers/hal.git"
subdirs:
- .
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment