Commit 6241198d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] OpenAlex merge

parents 00df9697 b2f9e554
......@@ -10,7 +10,8 @@ STORE_DIR="${1:-$DEFAULT_STORE}"
# changes, you have to make sure to update the `expected_cabal_projet_hash` with the
# `sha256sum` result calculated on the `cabal.project`. This ensures the `cabal.project`
# stays deterministic so that CI cache can kick in.
expected_cabal_project_hash="2754bf61cc7a2aa7b29345ffe34dc1e90a06426f00fc39da9f793cd828be4e15"
#expected_cabal_project_hash="2754bf61cc7a2aa7b29345ffe34dc1e90a06426f00fc39da9f793cd828be4e15"
expected_cabal_project_hash="5e989e199765ba2dd476208a66e96495ade69eb7cb14c0a448dfebd5748c9b39"
cabal --store-dir=$STORE_DIR v2-update 'hackage.haskell.org,2023-06-24T21:28:46Z'
......
......@@ -99,7 +99,7 @@ source-repository-package
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/openalex.git
tag: dab07cb89e8ab8eaaff8619f5e21d944d9c526ab
tag: 1cf872fb3bd0e3e44af31247833c4b6bb7d0dca5
source-repository-package
type: git
......
......@@ -964,3 +964,4 @@ test-suite garg-test
, unordered-containers ^>= 0.2.16.0
, validity ^>= 0.11.0.1
default-language: Haskell2010
......@@ -93,7 +93,7 @@ documentUpload nId doc = do
Just c -> c
Nothing -> panic $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId
(theFullDate, (year, month, day)) <- liftBase $ dateSplit EN
(theFullDate, (year, month, day)) <- liftBase $ dateSplit
$ Just
$ view du_date doc
......
......@@ -17,6 +17,7 @@ module Gargantext.Core
import Data.Aeson
import Data.Either(Either(Left))
import Data.Hashable (Hashable)
import Data.LanguageCodes qualified as ISO639
import Data.Maybe (fromMaybe)
import Data.Morpheus.Types (GQLType)
import Data.Swagger
......@@ -45,6 +46,7 @@ import qualified Data.Map as Map
-- | All languages supported
-- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- TODO This should be deprecated in favor of iso-639 library
data Lang = All
| DE
| EL
......@@ -93,6 +95,28 @@ instance Hashable Lang
instance Arbitrary Lang where
arbitrary = arbitraryBoundedEnum
toISO639 :: Lang -> Maybe ISO639.ISO639_1
toISO639 DE = Just ISO639.DE
toISO639 EL = Just ISO639.EL
toISO639 EN = Just ISO639.EN
toISO639 ES = Just ISO639.ES
toISO639 FR = Just ISO639.FR
toISO639 IT = Just ISO639.IT
toISO639 PL = Just ISO639.PL
toISO639 PT = Just ISO639.PT
toISO639 RU = Just ISO639.RU
toISO639 UK = Just ISO639.UK
toISO639 ZH = Just ISO639.ZH
toISO639 All = Nothing
toISO639EN :: Lang -> ISO639.ISO639_1
toISO639EN l = fromMaybe ISO639.EN $ toISO639 l
iso639ToText :: ISO639.ISO639_1 -> Text
iso639ToText la = pack [a, b]
where
(a, b) = ISO639.toChars la
-- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
toISO639Lang :: Lang -> Maybe Text
toISO639Lang All = Nothing
......
......@@ -24,7 +24,7 @@ import Data.Either (Either(..))
import Data.Maybe
import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..))
import Gargantext.Core (Lang(..), toISO639)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
......@@ -57,13 +57,13 @@ get externalAPI la q mPubmedAPIKey limit = do
Left err -> pure $ Left $ InvalidInputQuery q (T.pack err)
Right corpusQuery -> case externalAPI of
OpenAlex -> first ExternalAPIError <$>
OpenAlex.get (fromMaybe "" Nothing {- email -}) q la limit
OpenAlex.get (fromMaybe "" Nothing {- email -}) q (toISO639 la) limit
PubMed -> first ExternalAPIError <$>
PUBMED.get (fromMaybe "" mPubmedAPIKey) corpusQuery limit
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv -> Right <$> Arxiv.get la corpusQuery limit
HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
HAL -> first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
......
......@@ -18,7 +18,7 @@ import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map
import Data.Maybe
import Data.Text (Text, pack, intercalate)
import Gargantext.Core (Lang(..))
-- import Gargantext.Core (Lang(..), toISO639Lang)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults
......@@ -28,40 +28,26 @@ import HAL.Client qualified as HAL
import HAL.Doc.Corpus qualified as HAL
import Servant.Client (ClientError)
toLang :: Lang -> Maybe ISO639.ISO639_1
toLang DE = Just ISO639.DE
toLang EL = Just ISO639.EL
toLang EN = Just ISO639.EN
toLang ES = Just ISO639.ES
toLang FR = Just ISO639.FR
toLang IT = Just ISO639.IT
toLang PL = Just ISO639.PL
toLang PT = Just ISO639.PT
toLang RU = Just ISO639.RU
toLang UK = Just ISO639.UK
toLang ZH = Just ISO639.ZH
toLang All = Nothing
get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument]
get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) (toLang la)
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) (toLang la)
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) la
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of
-- Left err -> panic $ pack $ show err
-- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la))
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h
(utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just _corpus_date)
Date.dateSplit (maybe (Just $ pack $ show Defaults.year) Just _corpus_date)
let abstractDefault = intercalate " " _corpus_abstract
let abstract = case toLang la of
let abstract = case la of
Nothing -> abstractDefault
Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map)
pure HyperdataDocument { _hd_bdd = Just "Hal"
......
......@@ -35,13 +35,13 @@ get la l q a = do
let
printErr (DecodeFailure e _) = panic e
printErr e = panic (cs $ show e)
toIsidoreDocs :: Reply -> [IsidoreDoc]
toIsidoreDocs (ReplyOnly r) = [r]
toIsidoreDocs (Replies rs) = rs
iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs)
pure hDocs
......@@ -58,7 +58,7 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
author :: Author -> Text
author (Author fn ln) = (_name fn) <> ", " <> (_name ln)
author (Authors aus) = Text.intercalate ". " $ map author aus
creator2text :: Creator -> Text
creator2text (Creator au) = author au
creator2text (Creators aus') = Text.intercalate ". " $ map author aus'
......@@ -67,9 +67,9 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
langText (LangText _l t1) = t1
langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.intercalate " " $ map langText ts
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
pure HyperdataDocument
{ _hd_bdd = Just "Isidore"
, _hd_doi = Nothing
......@@ -91,5 +91,3 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l
}
......@@ -82,7 +82,7 @@ toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
--printDebug "ISTEX date" d
(utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
Date.dateSplit (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
--printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
......
......@@ -10,8 +10,9 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.OpenAlex where
import Conduit
import Data.LanguageCodes qualified as ISO639
import qualified Data.Text as T
import Gargantext.Core (Lang, toISO639Lang)
import Gargantext.Core (iso639ToText)
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Protolude
......@@ -22,12 +23,12 @@ import Servant.Client (ClientError)
get :: Text
-> Corpus.RawQuery
-> Lang
-> Maybe ISO639.ISO639_1
-> Maybe Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get _email q lang mLimit = do
let limit = getLimit $ fromMaybe 10000 mLimit
let mFilter = (\l -> "language:" <> l) <$> toISO639Lang lang
let limit = getLimit $ fromMaybe 1000 mLimit
let mFilter = (\l -> "language:" <> iso639ToText l) <$> lang
eRes <- OA.fetchWorksC Nothing mFilter $ Just $ Corpus.getRawQuery q
pure $ (\(len, docsC) -> (len, docsC .| takeC limit .| mapC toDoc)) <$> eRes
......
......@@ -236,7 +236,7 @@ toDoc ff d = do
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit dateToParse
let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
......
......@@ -48,10 +48,10 @@ import qualified Data.List as List
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit l (Just txt) = do
utcTime <- parse l txt
dateSplit :: Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
dateSplit Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit (Just txt) = do
utcTime <- parse txt
let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d))
......@@ -72,15 +72,15 @@ type Day = Int
-- 1900-04-01 19:00:00 UTC
-- >>> parse EN (pack "April 1 1900")
-- 1900-04-01 00:00:00 UTC
parse :: Lang -> Text -> IO UTCTime
parse lang s = do
parse :: Text -> IO UTCTime
parse s = do
-- printDebug "Date: " s
let result = dateFlow (DucklingFailure s)
--printDebug "Date': " dateStr'
case result of
DateFlowSuccess ok -> pure ok
DateFlowFailure -> (withDebugMode (DebugMode True)
"[G.C.T.P.T.Date parse]" (lang,s)
"[G.C.T.P.T.Date parse]" s
$ getCurrentTime)
_ -> panic "[G.C.T.C.Parsers.Date] parse: Should not happen"
......@@ -206,4 +206,3 @@ parseDateWithDuckling lang input options = do
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
-- TODO check/test Options False or True
analyze input contxt options $ HashSet.fromList [(Seal Time)]
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
<<<<<<< HEAD
Description : To query Wikidata
Description : To query Wikidata
=======
Description : To query Wikidata
>>>>>>> dev-clustering
......@@ -68,7 +68,7 @@ wikiPageToDocument m wr = do
source = Nothing
abstract = Just $ concat $ take m sections
(date, (year, month, day)) <- dateSplit EN $ head
(date, (year, month, day)) <- dateSplit $ head
$ catMaybes
[ wr ^. wr_yearStart
, wr ^. wr_yearEnd
......@@ -130,4 +130,3 @@ wikidataQuery n = List.unlines
," }"
," LIMIT " <> (cs $ show n)
]
......@@ -22,11 +22,13 @@ Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Core.Text.Terms.Multi.PosTagging
where
module Gargantext.Core.Text.Terms.Multi.PosTagging where
import Control.Exception (catch, throwIO)
import Data.Aeson
import Data.ByteString.Lazy.Char8 qualified as BSL
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Map qualified as Map
import Data.Set (fromList)
import Data.Text (Text, splitOn, pack, toLower)
import Gargantext.Core (Lang(..))
......@@ -35,8 +37,6 @@ import Gargantext.Core.Types
import Gargantext.Prelude
import Network.HTTP.Simple
import Network.URI (URI(..))
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified Data.Map as Map
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
......@@ -82,7 +82,15 @@ corenlp' uri lang txt = do
req <- parseRequest $
"POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
httpJSON $ setRequestBodyLBS (cs txt) req
-- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
case e of
JSONParseException _req res _err -> do
let body = getResponseBody res
printDebug "[corenlp'] request text" (cs txt :: ByteString)
printDebug "[corenlp'] response body (error)" body
throwIO e
JSONConversionException _req _res _err -> throwIO e
where
properties_ :: [(Text, Text)]
properties_ = case lang of
......
......@@ -60,7 +60,7 @@ extra-deps:
- git: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
commit: 2d7e5753cbbce248b860b571a0e9885415c846f7
- git: https://gitlab.iscpif.fr/gargantext/crawlers/openalex.git
commit: dab07cb89e8ab8eaaff8619f5e21d944d9c526ab
commit: 1cf872fb3bd0e3e44af31247833c4b6bb7d0dca5
# NP libs
- git: https://github.com/alpmestan/servant-job.git
commit: b4182487cfe479777c11ca19f3c0d47840b376f6
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment