Commit 6241198d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] OpenAlex merge

parents 00df9697 b2f9e554
...@@ -10,7 +10,8 @@ STORE_DIR="${1:-$DEFAULT_STORE}" ...@@ -10,7 +10,8 @@ STORE_DIR="${1:-$DEFAULT_STORE}"
# changes, you have to make sure to update the `expected_cabal_projet_hash` with the # changes, you have to make sure to update the `expected_cabal_projet_hash` with the
# `sha256sum` result calculated on the `cabal.project`. This ensures the `cabal.project` # `sha256sum` result calculated on the `cabal.project`. This ensures the `cabal.project`
# stays deterministic so that CI cache can kick in. # stays deterministic so that CI cache can kick in.
expected_cabal_project_hash="2754bf61cc7a2aa7b29345ffe34dc1e90a06426f00fc39da9f793cd828be4e15" #expected_cabal_project_hash="2754bf61cc7a2aa7b29345ffe34dc1e90a06426f00fc39da9f793cd828be4e15"
expected_cabal_project_hash="5e989e199765ba2dd476208a66e96495ade69eb7cb14c0a448dfebd5748c9b39"
cabal --store-dir=$STORE_DIR v2-update 'hackage.haskell.org,2023-06-24T21:28:46Z' cabal --store-dir=$STORE_DIR v2-update 'hackage.haskell.org,2023-06-24T21:28:46Z'
......
...@@ -99,7 +99,7 @@ source-repository-package ...@@ -99,7 +99,7 @@ source-repository-package
source-repository-package source-repository-package
type: git type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/openalex.git location: https://gitlab.iscpif.fr/gargantext/crawlers/openalex.git
tag: dab07cb89e8ab8eaaff8619f5e21d944d9c526ab tag: 1cf872fb3bd0e3e44af31247833c4b6bb7d0dca5
source-repository-package source-repository-package
type: git type: git
......
...@@ -964,3 +964,4 @@ test-suite garg-test ...@@ -964,3 +964,4 @@ test-suite garg-test
, unordered-containers ^>= 0.2.16.0 , unordered-containers ^>= 0.2.16.0
, validity ^>= 0.11.0.1 , validity ^>= 0.11.0.1
default-language: Haskell2010 default-language: Haskell2010
...@@ -93,7 +93,7 @@ documentUpload nId doc = do ...@@ -93,7 +93,7 @@ documentUpload nId doc = do
Just c -> c Just c -> c
Nothing -> panic $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId Nothing -> panic $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId
(theFullDate, (year, month, day)) <- liftBase $ dateSplit EN (theFullDate, (year, month, day)) <- liftBase $ dateSplit
$ Just $ Just
$ view du_date doc $ view du_date doc
......
...@@ -17,6 +17,7 @@ module Gargantext.Core ...@@ -17,6 +17,7 @@ module Gargantext.Core
import Data.Aeson import Data.Aeson
import Data.Either(Either(Left)) import Data.Either(Either(Left))
import Data.Hashable (Hashable) import Data.Hashable (Hashable)
import Data.LanguageCodes qualified as ISO639
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.Morpheus.Types (GQLType) import Data.Morpheus.Types (GQLType)
import Data.Swagger import Data.Swagger
...@@ -45,6 +46,7 @@ import qualified Data.Map as Map ...@@ -45,6 +46,7 @@ import qualified Data.Map as Map
-- | All languages supported -- | All languages supported
-- NOTE: Use international country codes -- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes -- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- TODO This should be deprecated in favor of iso-639 library
data Lang = All data Lang = All
| DE | DE
| EL | EL
...@@ -93,6 +95,28 @@ instance Hashable Lang ...@@ -93,6 +95,28 @@ instance Hashable Lang
instance Arbitrary Lang where instance Arbitrary Lang where
arbitrary = arbitraryBoundedEnum arbitrary = arbitraryBoundedEnum
toISO639 :: Lang -> Maybe ISO639.ISO639_1
toISO639 DE = Just ISO639.DE
toISO639 EL = Just ISO639.EL
toISO639 EN = Just ISO639.EN
toISO639 ES = Just ISO639.ES
toISO639 FR = Just ISO639.FR
toISO639 IT = Just ISO639.IT
toISO639 PL = Just ISO639.PL
toISO639 PT = Just ISO639.PT
toISO639 RU = Just ISO639.RU
toISO639 UK = Just ISO639.UK
toISO639 ZH = Just ISO639.ZH
toISO639 All = Nothing
toISO639EN :: Lang -> ISO639.ISO639_1
toISO639EN l = fromMaybe ISO639.EN $ toISO639 l
iso639ToText :: ISO639.ISO639_1 -> Text
iso639ToText la = pack [a, b]
where
(a, b) = ISO639.toChars la
-- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes -- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
toISO639Lang :: Lang -> Maybe Text toISO639Lang :: Lang -> Maybe Text
toISO639Lang All = Nothing toISO639Lang All = Nothing
......
...@@ -24,7 +24,7 @@ import Data.Either (Either(..)) ...@@ -24,7 +24,7 @@ import Data.Either (Either(..))
import Data.Maybe import Data.Maybe
import qualified Data.Text as T import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs) import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..), toISO639)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
...@@ -57,13 +57,13 @@ get externalAPI la q mPubmedAPIKey limit = do ...@@ -57,13 +57,13 @@ get externalAPI la q mPubmedAPIKey limit = do
Left err -> pure $ Left $ InvalidInputQuery q (T.pack err) Left err -> pure $ Left $ InvalidInputQuery q (T.pack err)
Right corpusQuery -> case externalAPI of Right corpusQuery -> case externalAPI of
OpenAlex -> first ExternalAPIError <$> OpenAlex -> first ExternalAPIError <$>
OpenAlex.get (fromMaybe "" Nothing {- email -}) q la limit OpenAlex.get (fromMaybe "" Nothing {- email -}) q (toISO639 la) limit
PubMed -> first ExternalAPIError <$> PubMed -> first ExternalAPIError <$>
PUBMED.get (fromMaybe "" mPubmedAPIKey) corpusQuery limit PUBMED.get (fromMaybe "" mPubmedAPIKey) corpusQuery limit
--docs <- PUBMED.get q default_limit -- EN only by default --docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs) --pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv -> Right <$> Arxiv.get la corpusQuery limit Arxiv -> Right <$> Arxiv.get la corpusQuery limit
HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) HAL -> first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs) pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing Isidore -> do docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
......
...@@ -18,7 +18,7 @@ import Data.LanguageCodes qualified as ISO639 ...@@ -18,7 +18,7 @@ import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Maybe import Data.Maybe
import Data.Text (Text, pack, intercalate) import Data.Text (Text, pack, intercalate)
import Gargantext.Core (Lang(..)) -- import Gargantext.Core (Lang(..), toISO639Lang)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
...@@ -28,40 +28,26 @@ import HAL.Client qualified as HAL ...@@ -28,40 +28,26 @@ import HAL.Client qualified as HAL
import HAL.Doc.Corpus qualified as HAL import HAL.Doc.Corpus qualified as HAL
import Servant.Client (ClientError) import Servant.Client (ClientError)
toLang :: Lang -> Maybe ISO639.ISO639_1 get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
toLang DE = Just ISO639.DE
toLang EL = Just ISO639.EL
toLang EN = Just ISO639.EN
toLang ES = Just ISO639.ES
toLang FR = Just ISO639.FR
toLang IT = Just ISO639.IT
toLang PL = Just ISO639.PL
toLang PT = Just ISO639.PT
toLang RU = Just ISO639.RU
toLang UK = Just ISO639.UK
toLang ZH = Just ISO639.ZH
toLang All = Nothing
get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do get la q ml = do
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) (toLang la) eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())) getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do getC la q ml = do
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) (toLang la) eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) la
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of -- case eRes of
-- Left err -> panic $ pack $ show err -- Left err -> panic $ pack $ show err
-- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la)) -- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la))
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus { .. }) = do toDoc' la (HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h -- printDebug "[toDoc corpus] h" h
(utctime, (pub_year, pub_month, pub_day)) <- (utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just _corpus_date) Date.dateSplit (maybe (Just $ pack $ show Defaults.year) Just _corpus_date)
let abstractDefault = intercalate " " _corpus_abstract let abstractDefault = intercalate " " _corpus_abstract
let abstract = case toLang la of let abstract = case la of
Nothing -> abstractDefault Nothing -> abstractDefault
Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map) Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map)
pure HyperdataDocument { _hd_bdd = Just "Hal" pure HyperdataDocument { _hd_bdd = Just "Hal"
......
...@@ -35,13 +35,13 @@ get la l q a = do ...@@ -35,13 +35,13 @@ get la l q a = do
let let
printErr (DecodeFailure e _) = panic e printErr (DecodeFailure e _) = panic e
printErr e = panic (cs $ show e) printErr e = panic (cs $ show e)
toIsidoreDocs :: Reply -> [IsidoreDoc] toIsidoreDocs :: Reply -> [IsidoreDoc]
toIsidoreDocs (ReplyOnly r) = [r] toIsidoreDocs (ReplyOnly r) = [r]
toIsidoreDocs (Replies rs) = rs toIsidoreDocs (Replies rs) = rs
iDocs <- either printErr _content <$> Isidore.get l q a iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs) hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs)
pure hDocs pure hDocs
...@@ -58,7 +58,7 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do ...@@ -58,7 +58,7 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
author :: Author -> Text author :: Author -> Text
author (Author fn ln) = (_name fn) <> ", " <> (_name ln) author (Author fn ln) = (_name fn) <> ", " <> (_name ln)
author (Authors aus) = Text.intercalate ". " $ map author aus author (Authors aus) = Text.intercalate ". " $ map author aus
creator2text :: Creator -> Text creator2text :: Creator -> Text
creator2text (Creator au) = author au creator2text (Creator au) = author au
creator2text (Creators aus') = Text.intercalate ". " $ map author aus' creator2text (Creators aus') = Text.intercalate ". " $ map author aus'
...@@ -67,9 +67,9 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do ...@@ -67,9 +67,9 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
langText (LangText _l t1) = t1 langText (LangText _l t1) = t1
langText (OnlyText t2 ) = t2 langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.intercalate " " $ map langText ts langText (ArrayText ts ) = Text.intercalate " " $ map langText ts
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just $ Text.pack $ show Defaults.year) (Just) d) (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
pure HyperdataDocument pure HyperdataDocument
{ _hd_bdd = Just "Isidore" { _hd_bdd = Just "Isidore"
, _hd_doi = Nothing , _hd_doi = Nothing
...@@ -91,5 +91,3 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do ...@@ -91,5 +91,3 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
, _hd_publication_second = Nothing , _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l , _hd_language_iso2 = Just $ (Text.pack . show) l
} }
...@@ -82,7 +82,7 @@ toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument ...@@ -82,7 +82,7 @@ toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do toDoc la (ISTEX.Document i t a ab d s) = do
--printDebug "ISTEX date" d --printDebug "ISTEX date" d
(utctime, (pub_year, pub_month, pub_day)) <- (utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d) Date.dateSplit (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
--printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day)) --printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
pure $ HyperdataDocument { _hd_bdd = Just "Istex" pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i , _hd_doi = Just i
......
...@@ -10,8 +10,9 @@ Portability : POSIX ...@@ -10,8 +10,9 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.OpenAlex where module Gargantext.Core.Text.Corpus.API.OpenAlex where
import Conduit import Conduit
import Data.LanguageCodes qualified as ISO639
import qualified Data.Text as T import qualified Data.Text as T
import Gargantext.Core (Lang, toISO639Lang) import Gargantext.Core (iso639ToText)
import Gargantext.Core.Text.Corpus.Query as Corpus import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Protolude import Protolude
...@@ -22,12 +23,12 @@ import Servant.Client (ClientError) ...@@ -22,12 +23,12 @@ import Servant.Client (ClientError)
get :: Text get :: Text
-> Corpus.RawQuery -> Corpus.RawQuery
-> Lang -> Maybe ISO639.ISO639_1
-> Maybe Limit -> Maybe Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())) -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get _email q lang mLimit = do get _email q lang mLimit = do
let limit = getLimit $ fromMaybe 10000 mLimit let limit = getLimit $ fromMaybe 1000 mLimit
let mFilter = (\l -> "language:" <> l) <$> toISO639Lang lang let mFilter = (\l -> "language:" <> iso639ToText l) <$> lang
eRes <- OA.fetchWorksC Nothing mFilter $ Just $ Corpus.getRawQuery q eRes <- OA.fetchWorksC Nothing mFilter $ Just $ Corpus.getRawQuery q
pure $ (\(len, docsC) -> (len, docsC .| takeC limit .| mapC toDoc)) <$> eRes pure $ (\(len, docsC) -> (len, docsC .| takeC limit .| mapC toDoc)) <$> eRes
......
...@@ -236,7 +236,7 @@ toDoc ff d = do ...@@ -236,7 +236,7 @@ toDoc ff d = do
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit dateToParse
let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d , _hd_doi = lookup "doi" d
......
...@@ -48,10 +48,10 @@ import qualified Data.List as List ...@@ -48,10 +48,10 @@ import qualified Data.List as List
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Parse date to Ints -- | Parse date to Ints
-- TODO add hours, minutes and seconds -- TODO add hours, minutes and seconds
dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day)) dateSplit :: Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing)) dateSplit Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit l (Just txt) = do dateSplit (Just txt) = do
utcTime <- parse l txt utcTime <- parse txt
let (y, m, d) = split' utcTime let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d)) pure (Just utcTime, (Just y, Just m,Just d))
...@@ -72,15 +72,15 @@ type Day = Int ...@@ -72,15 +72,15 @@ type Day = Int
-- 1900-04-01 19:00:00 UTC -- 1900-04-01 19:00:00 UTC
-- >>> parse EN (pack "April 1 1900") -- >>> parse EN (pack "April 1 1900")
-- 1900-04-01 00:00:00 UTC -- 1900-04-01 00:00:00 UTC
parse :: Lang -> Text -> IO UTCTime parse :: Text -> IO UTCTime
parse lang s = do parse s = do
-- printDebug "Date: " s -- printDebug "Date: " s
let result = dateFlow (DucklingFailure s) let result = dateFlow (DucklingFailure s)
--printDebug "Date': " dateStr' --printDebug "Date': " dateStr'
case result of case result of
DateFlowSuccess ok -> pure ok DateFlowSuccess ok -> pure ok
DateFlowFailure -> (withDebugMode (DebugMode True) DateFlowFailure -> (withDebugMode (DebugMode True)
"[G.C.T.P.T.Date parse]" (lang,s) "[G.C.T.P.T.Date parse]" s
$ getCurrentTime) $ getCurrentTime)
_ -> panic "[G.C.T.C.Parsers.Date] parse: Should not happen" _ -> panic "[G.C.T.C.Parsers.Date] parse: Should not happen"
...@@ -206,4 +206,3 @@ parseDateWithDuckling lang input options = do ...@@ -206,4 +206,3 @@ parseDateWithDuckling lang input options = do
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
-- TODO check/test Options False or True -- TODO check/test Options False or True
analyze input contxt options $ HashSet.fromList [(Seal Time)] analyze input contxt options $ HashSet.fromList [(Seal Time)]
{-| {-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
<<<<<<< HEAD <<<<<<< HEAD
Description : To query Wikidata Description : To query Wikidata
======= =======
Description : To query Wikidata Description : To query Wikidata
>>>>>>> dev-clustering >>>>>>> dev-clustering
...@@ -68,7 +68,7 @@ wikiPageToDocument m wr = do ...@@ -68,7 +68,7 @@ wikiPageToDocument m wr = do
source = Nothing source = Nothing
abstract = Just $ concat $ take m sections abstract = Just $ concat $ take m sections
(date, (year, month, day)) <- dateSplit EN $ head (date, (year, month, day)) <- dateSplit $ head
$ catMaybes $ catMaybes
[ wr ^. wr_yearStart [ wr ^. wr_yearStart
, wr ^. wr_yearEnd , wr ^. wr_yearEnd
...@@ -130,4 +130,3 @@ wikidataQuery n = List.unlines ...@@ -130,4 +130,3 @@ wikidataQuery n = List.unlines
," }" ," }"
," LIMIT " <> (cs $ show n) ," LIMIT " <> (cs $ show n)
] ]
...@@ -22,11 +22,13 @@ Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging ...@@ -22,11 +22,13 @@ Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeOperators #-}
module Gargantext.Core.Text.Terms.Multi.PosTagging module Gargantext.Core.Text.Terms.Multi.PosTagging where
where
import Control.Exception (catch, throwIO)
import Data.Aeson import Data.Aeson
import Data.ByteString.Lazy.Char8 qualified as BSL
import Data.ByteString.Lazy.Internal (ByteString) import Data.ByteString.Lazy.Internal (ByteString)
import Data.Map qualified as Map
import Data.Set (fromList) import Data.Set (fromList)
import Data.Text (Text, splitOn, pack, toLower) import Data.Text (Text, splitOn, pack, toLower)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
...@@ -35,8 +37,6 @@ import Gargantext.Core.Types ...@@ -35,8 +37,6 @@ import Gargantext.Core.Types
import Gargantext.Prelude import Gargantext.Prelude
import Network.HTTP.Simple import Network.HTTP.Simple
import Network.URI (URI(..)) import Network.URI (URI(..))
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified Data.Map as Map
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP -- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
...@@ -82,7 +82,15 @@ corenlp' uri lang txt = do ...@@ -82,7 +82,15 @@ corenlp' uri lang txt = do
req <- parseRequest $ req <- parseRequest $
"POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) }) "POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq . -- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
httpJSON $ setRequestBodyLBS (cs txt) req -- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
case e of
JSONParseException _req res _err -> do
let body = getResponseBody res
printDebug "[corenlp'] request text" (cs txt :: ByteString)
printDebug "[corenlp'] response body (error)" body
throwIO e
JSONConversionException _req _res _err -> throwIO e
where where
properties_ :: [(Text, Text)] properties_ :: [(Text, Text)]
properties_ = case lang of properties_ = case lang of
......
...@@ -60,7 +60,7 @@ extra-deps: ...@@ -60,7 +60,7 @@ extra-deps:
- git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git - git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
commit: 2d7e5753cbbce248b860b571a0e9885415c846f7 commit: 2d7e5753cbbce248b860b571a0e9885415c846f7
- git: https://gitlab.iscpif.fr/gargantext/crawlers/openalex.git - git: https://gitlab.iscpif.fr/gargantext/crawlers/openalex.git
commit: dab07cb89e8ab8eaaff8619f5e21d944d9c526ab commit: 1cf872fb3bd0e3e44af31247833c4b6bb7d0dca5
# NP libs # NP libs
- git: https://github.com/alpmestan/servant-job.git - git: https://github.com/alpmestan/servant-job.git
commit: b4182487cfe479777c11ca19f3c0d47840b376f6 commit: b4182487cfe479777c11ca19f3c0d47840b376f6
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment