Commit 46d254d9 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'dev-merge' into dev

parents 0322ffc8 bdd5deec
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# https://vadosware.io/post/zero-to-continuous-integrated-testing-a-haskell-project-with-gitlab/ # https://vadosware.io/post/zero-to-continuous-integrated-testing-a-haskell-project-with-gitlab/
# #
# #
image: cgenie/stack-build:lts-17.13-garg image: cgenie/stack-build:lts-18.18-garg
variables: variables:
STACK_ROOT: "${CI_PROJECT_DIR}/.stack-root" STACK_ROOT: "${CI_PROJECT_DIR}/.stack-root"
......
...@@ -246,6 +246,7 @@ library: ...@@ -246,6 +246,7 @@ library:
- timezone-series - timezone-series
- transformers - transformers
- transformers-base - transformers-base
- tuple
- unordered-containers - unordered-containers
- utf8-string - utf8-string
- uuid - uuid
......
...@@ -200,7 +200,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q ...@@ -200,7 +200,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
Just Web -> do Just Web -> do
printDebug "[addToCorpusWithQuery] processing web request" datafield printDebug "[addToCorpusWithQuery] processing web request" datafield
_ <- triggerSearxSearch cid q l _ <- triggerSearxSearch user cid q l logStatus
pure JobLog { _scst_succeeded = Just 3 pure JobLog { _scst_succeeded = Just 3
, _scst_failed = Just 0 , _scst_failed = Just 0
......
...@@ -7,32 +7,60 @@ module Gargantext.API.Node.Corpus.Searx where ...@@ -7,32 +7,60 @@ module Gargantext.API.Node.Corpus.Searx where
import Control.Lens (view) import Control.Lens (view)
import qualified Data.Aeson as Aeson import qualified Data.Aeson as Aeson
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import Data.Either (Either(..))
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as T import qualified Data.Text as T
import Data.Time.Calendar (Day, toGregorian)
import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
import Data.Tuple.Select (sel1, sel2, sel3)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Network.HTTP.Client import Network.HTTP.Client
import Network.HTTP.Client.TLS import Network.HTTP.Client.TLS
import qualified Prelude as Prelude import qualified Prelude as Prelude
import Protolude (encodeUtf8, Text, Either) import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Prelude.Config import Gargantext.Prelude.Config
import Gargantext.Core (Lang(..)) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
--import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess)
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import qualified Gargantext.Core.Text.Corpus.API as API import qualified Gargantext.Core.Text.Corpus.API as API
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix) import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
import Gargantext.Database.Action.Flow.Types (FlowCmdM) import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Node (CorpusId) import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Config (userMaster)
import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
import Gargantext.Database.Prelude (hasConfig) import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
langToSearx :: Lang -> Text
langToSearx EN = "en-US"
langToSearx FR = "fr-FR"
langToSearx All = "en-US"
data SearxResult = SearxResult data SearxResult = SearxResult
{ _sr_url :: Text { _sr_url :: Text
, _sr_title :: Text , _sr_title :: Text
, _sr_content :: Maybe Text , _sr_content :: Maybe Text
, _sr_engine :: Text , _sr_engine :: Text
, _sr_score :: Double , _sr_score :: Double
, _sr_category :: Text , _sr_category :: Text
, _sr_pretty_url :: Text } , _sr_pretty_url :: Text
, _sr_publishedDate :: Text -- "Nov 19, 2021"
, _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
}
deriving (Show, Eq, Generic) deriving (Show, Eq, Generic)
-- , _sr_parsed_url -- , _sr_parsed_url
-- , _sr_engines -- , _sr_engines
...@@ -76,34 +104,117 @@ fetchSearxPage (FetchSearxParams { _fsp_language ...@@ -76,34 +104,117 @@ fetchSearxPage (FetchSearxParams { _fsp_language
, ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976 , ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
, ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno) , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
--, ("time_range", "None") --, ("time_range", "None")
, ("language", encodeUtf8 $ T.pack $ show _fsp_language) , ("language", encodeUtf8 $ langToSearx _fsp_language)
, ("format", "json") , ("format", "json")
] req ] req
res <- httpLbs request _fsp_manager res <- httpLbs request _fsp_manager
let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse) let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
pure dec pure dec
insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
=> User
-> CorpusId
-> ListId
-> Lang
-> Either Prelude.String SearxResponse
-> m ()
insertSearxResponse _ _ _ _ (Left _) = pure ()
insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
--printDebug "[triggerSearxSearch] docs" docs
-- docs :: [Either Text HyperdataDocument]
let docs' = catMaybes $ rightToMaybe <$> docs
Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
printDebug "[triggerSearxSearch] doc time" $
"[title] " <> (show _hd_title) <>
" :: [publication_year] " <> (show _hd_publication_year) <>
" :: [publication_date] " <> (show _hd_publication_date)
) docs'
--_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
let mCorpus = Nothing :: Maybe HyperdataCorpus
ids <- insertMasterDocs mCorpus (Multi l) docs'
_ <- Doc.add cId ids
(_masterUserId, _masterRootId, masterCorpusId)
<- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
let gp = GroupWithPosTag l CoreNLP HashMap.empty
ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
_userListId <- flowList_DbRepo listId ngs
pure ()
-- TODO Make an async task out of this?
triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m) triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
=> CorpusId => User
-> CorpusId
-> API.Query -> API.Query
-> Lang -> Lang
-> m () -> (JobLog -> m ())
-> m JobLog
triggerSearxSearch cid q l = do triggerSearxSearch user cId q l logStatus = do
printDebug "[triggerSearxSearch] cid" cid let numPages = 100
let jobLog = JobLog { _scst_succeeded = Just 0
, _scst_failed = Just 0
, _scst_remaining = Just numPages
, _scst_events = Just []
}
logStatus jobLog
printDebug "[triggerSearxSearch] cId" cId
printDebug "[triggerSearxSearch] q" q printDebug "[triggerSearxSearch] q" q
printDebug "[triggerSearxSearch] l" l printDebug "[triggerSearxSearch] l" l
cfg <- view hasConfig cfg <- view hasConfig
uId <- getUserId user
let surl = _gc_frame_searx_url cfg let surl = _gc_frame_searx_url cfg
printDebug "[triggerSearxSearch] surl" surl printDebug "[triggerSearxSearch] surl" surl
mListId <- defaultListMaybe cId
listId <- case mListId of
Nothing -> do
listId <- getOrMkList cId uId
pure listId
Just listId -> pure listId
printDebug "[triggerSearxSearch] listId" listId
manager <- liftBase $ newManager tlsManagerSettings manager <- liftBase $ newManager tlsManagerSettings
res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l _ <- mapM (\page -> do
, _fsp_manager = manager res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
, _fsp_pageno = 1 , _fsp_manager = manager
, _fsp_query = q , _fsp_pageno = page
, _fsp_url = surl } , _fsp_query = q
, _fsp_url = surl }
printDebug "[triggerSearxSearch] res" res
insertSearxResponse user cId listId l res
logStatus $ JobLog { _scst_succeeded = Just page
, _scst_failed = Just 0
, _scst_remaining = Just (numPages - page)
, _scst_events = Just [] }
) [1..numPages]
--printDebug "[triggerSearxSearch] res" res
pure $ jobLogSuccess jobLog
hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
let mGregorian = toGregorian <$> mDate
Right HyperdataDocument { _hd_bdd = Just "Searx"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just _sr_title
, _hd_authors = Nothing
, _hd_institutes = Nothing
, _hd_source = Just _sr_engine
, _hd_abstract = _sr_content
, _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
, _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
, _hd_publication_month = sel2 <$> mGregorian
, _hd_publication_day = sel3 <$> mGregorian
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show l }
pure ()
...@@ -71,9 +71,9 @@ import Gargantext.Core.Ext.IMT (toSchoolName) ...@@ -71,9 +71,9 @@ import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire) import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types import Gargantext.Core.Flow.Types
import Gargantext.Core.Text import Gargantext.Core.Text
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists) import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith) import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
......
...@@ -279,7 +279,10 @@ instance ToNode HyperdataDocument where ...@@ -279,7 +279,10 @@ instance ToNode HyperdataDocument where
where where
n = maybe "No Title" (DT.take 255) (_hd_title h) n = maybe "No Title" (DT.take 255) (_hd_title h)
date = jour y m d date = jour y m d
y = maybe 0 fromIntegral $ _hd_publication_year h -- NOTE: There is no year '0' in postgres, there is year 1 AD and beofre that year 1 BC:
-- select '0001-01-01'::date, '0001-01-01'::date - '1 day'::interval;
-- 0001-01-01 0001-12-31 00:00:00 BC
y = maybe 1 fromIntegral $ _hd_publication_year h
m = fromMaybe 1 $ _hd_publication_month h m = fromMaybe 1 $ _hd_publication_month h
d = fromMaybe 1 $ _hd_publication_day h d = fromMaybe 1 $ _hd_publication_day h
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment