[hal] implement search with language

parent ceb48659
...@@ -84,7 +84,7 @@ source-repository-package ...@@ -84,7 +84,7 @@ source-repository-package
source-repository-package source-repository-package
type: git type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
tag: ac1bec2628cd2d6c8357b4af2f853f450c7b6123 tag: f7b928dab9eb14ff1a497d5e092c2224c133979e
source-repository-package source-repository-package
type: git type: git
......
cabal-version: 1.12 cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.1.
--
-- see: https://github.com/sol/hpack
name: gargantext name: gargantext
version: 0.0.6.9.9.7.5 version: 0.0.6.9.9.7.5
synopsis: Search, map, share synopsis: Search, map, share
description: Please see README.md description: Please see README.md
category: Data category: Data
...@@ -47,7 +51,6 @@ library ...@@ -47,7 +51,6 @@ library
Gargantext.API.Node Gargantext.API.Node
Gargantext.API.Node.Corpus.New Gargantext.API.Node.Corpus.New
Gargantext.API.Node.Corpus.Types Gargantext.API.Node.Corpus.Types
Gargantext.API.Node.Corpus.Update
Gargantext.API.Node.File Gargantext.API.Node.File
Gargantext.API.Node.Share Gargantext.API.Node.Share
Gargantext.API.Prelude Gargantext.API.Prelude
...@@ -59,13 +62,12 @@ library ...@@ -59,13 +62,12 @@ library
Gargantext.Core.Text.Context Gargantext.Core.Text.Context
Gargantext.Core.Text.Corpus.API Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.API.Pubmed Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.Query Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.List.Formats.CSV Gargantext.Core.Text.List.Formats.CSV
Gargantext.Core.Text.Metrics Gargantext.Core.Text.Metrics
Gargantext.Core.Text.Metrics.CharByChar Gargantext.Core.Text.Metrics.CharByChar
...@@ -111,7 +113,6 @@ library ...@@ -111,7 +113,6 @@ library
Gargantext.Database.Prelude Gargantext.Database.Prelude
Gargantext.Database.Query.Table.NgramsPostag Gargantext.Database.Query.Table.NgramsPostag
Gargantext.Database.Query.Table.Node Gargantext.Database.Query.Table.Node
Gargantext.Database.Query.Table.Node.Error
Gargantext.Database.Query.Table.Node.UpdateOpaleye Gargantext.Database.Query.Table.Node.UpdateOpaleye
Gargantext.Database.Query.Table.User Gargantext.Database.Query.Table.User
Gargantext.Database.Schema.Ngrams Gargantext.Database.Schema.Ngrams
...@@ -160,6 +161,7 @@ library ...@@ -160,6 +161,7 @@ library
Gargantext.API.Node.Corpus.New.File Gargantext.API.Node.Corpus.New.File
Gargantext.API.Node.Corpus.New.Types Gargantext.API.Node.Corpus.New.Types
Gargantext.API.Node.Corpus.Searx Gargantext.API.Node.Corpus.Searx
Gargantext.API.Node.Corpus.Update
Gargantext.API.Node.Document.Export Gargantext.API.Node.Document.Export
Gargantext.API.Node.Document.Export.Types Gargantext.API.Node.Document.Export.Types
Gargantext.API.Node.DocumentsFromWriteNodes Gargantext.API.Node.DocumentsFromWriteNodes
...@@ -205,9 +207,11 @@ library ...@@ -205,9 +207,11 @@ library
Gargantext.Core.Text.Corpus.Parsers.GrandDebat Gargantext.Core.Text.Corpus.Parsers.GrandDebat
Gargantext.Core.Text.Corpus.Parsers.Iramuteq Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.Corpus.Parsers.Json2Csv Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Gargantext.Core.Text.Corpus.Parsers.RIS Gargantext.Core.Text.Corpus.Parsers.RIS
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
Gargantext.Core.Text.Corpus.Parsers.Telegram
Gargantext.Core.Text.Corpus.Parsers.Wikidata Gargantext.Core.Text.Corpus.Parsers.Wikidata
Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
Gargantext.Core.Text.Corpus.Parsers.Wikimedia Gargantext.Core.Text.Corpus.Parsers.Wikimedia
...@@ -313,6 +317,7 @@ library ...@@ -313,6 +317,7 @@ library
Gargantext.Database.Query.Table.Node.Contact Gargantext.Database.Query.Table.Node.Contact
Gargantext.Database.Query.Table.Node.Document.Add Gargantext.Database.Query.Table.Node.Document.Add
Gargantext.Database.Query.Table.Node.Document.Insert Gargantext.Database.Query.Table.Node.Document.Insert
Gargantext.Database.Query.Table.Node.Error
Gargantext.Database.Query.Table.Node.Select Gargantext.Database.Query.Table.Node.Select
Gargantext.Database.Query.Table.Node.Update Gargantext.Database.Query.Table.Node.Update
Gargantext.Database.Query.Table.Node.User Gargantext.Database.Query.Table.Node.User
...@@ -356,6 +361,7 @@ library ...@@ -356,6 +361,7 @@ library
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -440,6 +446,7 @@ library ...@@ -440,6 +446,7 @@ library
, ihaskell , ihaskell
, ini , ini
, insert-ordered-containers , insert-ordered-containers
, iso639
, jose , jose
, json-stream , json-stream
, lens , lens
...@@ -559,6 +566,7 @@ executable gargantext-admin ...@@ -559,6 +566,7 @@ executable gargantext-admin
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -588,6 +596,7 @@ executable gargantext-cbor2json ...@@ -588,6 +596,7 @@ executable gargantext-cbor2json
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -621,6 +630,7 @@ executable gargantext-cli ...@@ -621,6 +630,7 @@ executable gargantext-cli
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -661,6 +671,7 @@ executable gargantext-import ...@@ -661,6 +671,7 @@ executable gargantext-import
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -691,6 +702,7 @@ executable gargantext-init ...@@ -691,6 +702,7 @@ executable gargantext-init
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -721,6 +733,7 @@ executable gargantext-invitations ...@@ -721,6 +733,7 @@ executable gargantext-invitations
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -750,6 +763,7 @@ executable gargantext-phylo ...@@ -750,6 +763,7 @@ executable gargantext-phylo
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -793,6 +807,7 @@ executable gargantext-server ...@@ -793,6 +807,7 @@ executable gargantext-server
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -830,6 +845,7 @@ executable gargantext-upgrade ...@@ -830,6 +845,7 @@ executable gargantext-upgrade
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -885,6 +901,7 @@ test-suite garg-test ...@@ -885,6 +901,7 @@ test-suite garg-test
FlexibleInstances FlexibleInstances
GADTs GADTs
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NamedFieldPuns NamedFieldPuns
NoImplicitPrelude NoImplicitPrelude
...@@ -897,6 +914,7 @@ test-suite garg-test ...@@ -897,6 +914,7 @@ test-suite garg-test
FlexibleContexts FlexibleContexts
FlexibleInstances FlexibleInstances
GeneralizedNewtypeDeriving GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses MultiParamTypeClasses
NoImplicitPrelude NoImplicitPrelude
OverloadedStrings OverloadedStrings
...@@ -917,23 +935,15 @@ test-suite garg-test ...@@ -917,23 +935,15 @@ test-suite garg-test
, gargantext , gargantext
, gargantext-prelude , gargantext-prelude
, hspec , hspec
, hspec-expectations >= 0.8.3
, http-client , http-client
, http-client-tls , http-client-tls
, monad-control
, mtl , mtl
, lens
, parsec , parsec
, patches-class , patches-class
, patches-map , patches-map
, postgres-options
, postgresql-simple
, quickcheck-instances , quickcheck-instances
, raw-strings-qq , raw-strings-qq
, recover-rtti
, resource-pool
, servant-job , servant-job
, shelly
, stm , stm
, tasty , tasty
, tasty-hspec , tasty-hspec
...@@ -941,7 +951,6 @@ test-suite garg-test ...@@ -941,7 +951,6 @@ test-suite garg-test
, tasty-quickcheck , tasty-quickcheck
, text , text
, time , time
, tmp-postgres
, unordered-containers , unordered-containers
, validity , validity
default-language: Haskell2010 default-language: Haskell2010
This diff is collapsed.
...@@ -14,47 +14,67 @@ module Gargantext.Core.Text.Corpus.API.Hal ...@@ -14,47 +14,67 @@ module Gargantext.Core.Text.Corpus.API.Hal
import Conduit import Conduit
import Data.Either import Data.Either
import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map
import Data.Maybe import Data.Maybe
import Data.Text (Text, pack, intercalate) import Data.Text (Text, pack, intercalate)
import Servant.Client (ClientError)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified Gargantext.Defaults as Defaults import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date import HAL qualified as HAL
import qualified HAL as HAL import HAL.Client qualified as HAL
import qualified HAL.Client as HAL import HAL.Doc.Corpus qualified as HAL
import qualified HAL.Doc.Corpus as HAL import Servant.Client (ClientError)
toLang :: Lang -> Maybe ISO639.ISO639_1
toLang DE = Just ISO639.DE
toLang EL = Just ISO639.EL
toLang EN = Just ISO639.EN
toLang ES = Just ISO639.ES
toLang FR = Just ISO639.FR
toLang IT = Just ISO639.IT
toLang PL = Just ISO639.PL
toLang PT = Just ISO639.PT
toLang RU = Just ISO639.RU
toLang UK = Just ISO639.UK
toLang ZH = Just ISO639.ZH
toLang All = Nothing
get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument] get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do get la q ml = do
eDocs <- HAL.getMetadataWith q (Just 0) (fromIntegral <$> ml) eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) (toLang la)
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())) getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do getC la q ml = do
eRes <- HAL.getMetadataWithC q (Just 0) (fromIntegral <$> ml) eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) (toLang la)
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of -- case eRes of
-- Left err -> panic $ pack $ show err -- Left err -> panic $ pack $ show err
-- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la)) -- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la))
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus i t ab d s aus affs struct_id) = do toDoc' la h@(HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h
(utctime, (pub_year, pub_month, pub_day)) <- (utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just d) Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just _corpus_date)
let abstractDefault = intercalate " " _corpus_abstract
let abstract = case toLang la of
Nothing -> abstractDefault
Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map)
pure HyperdataDocument { _hd_bdd = Just "Hal" pure HyperdataDocument { _hd_bdd = Just "Hal"
, _hd_doi = Just $ pack $ show i , _hd_doi = Just $ pack $ show _corpus_docid
, _hd_url = Nothing , _hd_url = Nothing
, _hd_uniqId = Nothing , _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing , _hd_uniqIdBdd = Nothing
, _hd_page = Nothing , _hd_page = Nothing
, _hd_title = Just $ intercalate " " t , _hd_title = Just $ intercalate " " _corpus_title
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" aus , _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" _corpus_authors_names
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ affs <> map (cs . show) struct_id , _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ _corpus_authors_affiliations <> map (cs . show) _corpus_struct_id
, _hd_source = Just $ maybe "Nothing" identity s , _hd_source = Just $ maybe "Nothing" identity _corpus_source
, _hd_abstract = Just $ intercalate " " ab , _hd_abstract = Just abstract
, _hd_publication_date = fmap (pack . show) utctime , _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year , _hd_publication_year = pub_year
, _hd_publication_month = pub_month , _hd_publication_month = pub_month
......
...@@ -59,6 +59,7 @@ tokenTag2terms (TokenTag ws t _ _) = Terms ws t ...@@ -59,6 +59,7 @@ tokenTag2terms (TokenTag ws t _ _) = Terms ws t
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]] tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = tokenTagsWith FR txt $ corenlp url
tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
-- printDebug "NLP Debug" txt -- printDebug "NLP Debug" txt
tokenTagsWith l txt $ SpacyNLP.nlp url tokenTagsWith l txt $ SpacyNLP.nlp url
...@@ -95,4 +96,3 @@ cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls ...@@ -95,4 +96,3 @@ cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
removeUrls = removeUrlsWith "http" . removeUrlsWith "www" removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space) removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment