[hal] implement search with language

parent ceb48659
Pipeline #4458 failed with stages
in 8 minutes and 55 seconds
......@@ -84,7 +84,7 @@ source-repository-package
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
tag: ac1bec2628cd2d6c8357b4af2f853f450c7b6123
tag: f7b928dab9eb14ff1a497d5e092c2224c133979e
source-repository-package
type: git
......
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.1.
--
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.9.9.7.5
version: 0.0.6.9.9.7.5
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -47,7 +51,6 @@ library
Gargantext.API.Node
Gargantext.API.Node.Corpus.New
Gargantext.API.Node.Corpus.Types
Gargantext.API.Node.Corpus.Update
Gargantext.API.Node.File
Gargantext.API.Node.Share
Gargantext.API.Prelude
......@@ -59,13 +62,12 @@ library
Gargantext.Core.Text.Context
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.List.Formats.CSV
Gargantext.Core.Text.Metrics
Gargantext.Core.Text.Metrics.CharByChar
......@@ -111,7 +113,6 @@ library
Gargantext.Database.Prelude
Gargantext.Database.Query.Table.NgramsPostag
Gargantext.Database.Query.Table.Node
Gargantext.Database.Query.Table.Node.Error
Gargantext.Database.Query.Table.Node.UpdateOpaleye
Gargantext.Database.Query.Table.User
Gargantext.Database.Schema.Ngrams
......@@ -160,6 +161,7 @@ library
Gargantext.API.Node.Corpus.New.File
Gargantext.API.Node.Corpus.New.Types
Gargantext.API.Node.Corpus.Searx
Gargantext.API.Node.Corpus.Update
Gargantext.API.Node.Document.Export
Gargantext.API.Node.Document.Export.Types
Gargantext.API.Node.DocumentsFromWriteNodes
......@@ -205,9 +207,11 @@ library
Gargantext.Core.Text.Corpus.Parsers.GrandDebat
Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Gargantext.Core.Text.Corpus.Parsers.RIS
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
Gargantext.Core.Text.Corpus.Parsers.Telegram
Gargantext.Core.Text.Corpus.Parsers.Wikidata
Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
Gargantext.Core.Text.Corpus.Parsers.Wikimedia
......@@ -313,6 +317,7 @@ library
Gargantext.Database.Query.Table.Node.Contact
Gargantext.Database.Query.Table.Node.Document.Add
Gargantext.Database.Query.Table.Node.Document.Insert
Gargantext.Database.Query.Table.Node.Error
Gargantext.Database.Query.Table.Node.Select
Gargantext.Database.Query.Table.Node.Update
Gargantext.Database.Query.Table.Node.User
......@@ -356,6 +361,7 @@ library
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -440,6 +446,7 @@ library
, ihaskell
, ini
, insert-ordered-containers
, iso639
, jose
, json-stream
, lens
......@@ -559,6 +566,7 @@ executable gargantext-admin
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -588,6 +596,7 @@ executable gargantext-cbor2json
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -621,6 +630,7 @@ executable gargantext-cli
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -661,6 +671,7 @@ executable gargantext-import
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -691,6 +702,7 @@ executable gargantext-init
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -721,6 +733,7 @@ executable gargantext-invitations
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -750,6 +763,7 @@ executable gargantext-phylo
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -793,6 +807,7 @@ executable gargantext-server
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -830,6 +845,7 @@ executable gargantext-upgrade
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -885,6 +901,7 @@ test-suite garg-test
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
......@@ -897,6 +914,7 @@ test-suite garg-test
FlexibleContexts
FlexibleInstances
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NoImplicitPrelude
OverloadedStrings
......@@ -917,23 +935,15 @@ test-suite garg-test
, gargantext
, gargantext-prelude
, hspec
, hspec-expectations >= 0.8.3
, http-client
, http-client-tls
, monad-control
, mtl
, lens
, parsec
, patches-class
, patches-map
, postgres-options
, postgresql-simple
, quickcheck-instances
, raw-strings-qq
, recover-rtti
, resource-pool
, servant-job
, shelly
, stm
, tasty
, tasty-hspec
......@@ -941,7 +951,6 @@ test-suite garg-test
, tasty-quickcheck
, text
, time
, tmp-postgres
, unordered-containers
, validity
default-language: Haskell2010
This diff is collapsed.
......@@ -14,47 +14,67 @@ module Gargantext.Core.Text.Corpus.API.Hal
import Conduit
import Data.Either
import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map
import Data.Maybe
import Data.Text (Text, pack, intercalate)
import Servant.Client (ClientError)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified Gargantext.Defaults as Defaults
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified HAL as HAL
import qualified HAL.Client as HAL
import qualified HAL.Doc.Corpus as HAL
import HAL qualified as HAL
import HAL.Client qualified as HAL
import HAL.Doc.Corpus qualified as HAL
import Servant.Client (ClientError)
toLang :: Lang -> Maybe ISO639.ISO639_1
toLang DE = Just ISO639.DE
toLang EL = Just ISO639.EL
toLang EN = Just ISO639.EN
toLang ES = Just ISO639.ES
toLang FR = Just ISO639.FR
toLang IT = Just ISO639.IT
toLang PL = Just ISO639.PL
toLang PT = Just ISO639.PT
toLang RU = Just ISO639.RU
toLang UK = Just ISO639.UK
toLang ZH = Just ISO639.ZH
toLang All = Nothing
get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
eDocs <- HAL.getMetadataWith q (Just 0) (fromIntegral <$> ml)
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) (toLang la)
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
eRes <- HAL.getMetadataWithC q (Just 0) (fromIntegral <$> ml)
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) (toLang la)
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of
-- Left err -> panic $ pack $ show err
-- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la))
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus i t ab d s aus affs struct_id) = do
toDoc' la h@(HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h
(utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just d)
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just _corpus_date)
let abstractDefault = intercalate " " _corpus_abstract
let abstract = case toLang la of
Nothing -> abstractDefault
Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map)
pure HyperdataDocument { _hd_bdd = Just "Hal"
, _hd_doi = Just $ pack $ show i
, _hd_doi = Just $ pack $ show _corpus_docid
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just $ intercalate " " t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" aus
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ affs <> map (cs . show) struct_id
, _hd_source = Just $ maybe "Nothing" identity s
, _hd_abstract = Just $ intercalate " " ab
, _hd_title = Just $ intercalate " " _corpus_title
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" _corpus_authors_names
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ _corpus_authors_affiliations <> map (cs . show) _corpus_struct_id
, _hd_source = Just $ maybe "Nothing" identity _corpus_source
, _hd_abstract = Just abstract
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
......
......@@ -59,6 +59,7 @@ tokenTag2terms (TokenTag ws t _ _) = Terms ws t
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = tokenTagsWith FR txt $ corenlp url
tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
-- printDebug "NLP Debug" txt
tokenTagsWith l txt $ SpacyNLP.nlp url
......@@ -95,4 +96,3 @@ cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment