Searx.hs 7.44 KB
Newer Older
1 2 3 4
{-# LANGUAGE TemplateHaskell #-}

module Gargantext.API.Node.Corpus.Searx where

Alexandre Delanoë's avatar
Alexandre Delanoë committed
5 6


7 8 9
import Control.Lens (view)
import qualified Data.Aeson as Aeson
import Data.Aeson.TH (deriveJSON)
10
import Data.Either (Either(..))
11
import qualified Data.Text as T
12
import Data.Time.Calendar (Day, toGregorian)
13
import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
14
import Data.Tuple.Select (sel1, sel2, sel3)
15 16 17 18
import GHC.Generics (Generic)
import Network.HTTP.Client
import Network.HTTP.Client.TLS

19
import qualified Prelude as Prelude
20
import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
21 22 23
import Gargantext.Prelude
import Gargantext.Prelude.Config

24 25 26
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
--import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotalWithMessage)
27 28
import Gargantext.Core (Lang(..))
import qualified Gargantext.Core.Text.Corpus.API as API
29 30
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
31
import Gargantext.Core.Utils.Prefix (unPrefix)
32
import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
Alexandre Delanoë's avatar
Alexandre Delanoë committed
33
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
34 35
import Gargantext.Database.Admin.Config ()
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
36 37
import Gargantext.Database.Admin.Types.Node (CorpusId)
import Gargantext.Database.Prelude (hasConfig)
38
import Gargantext.Database.Query.Table.Node (defaultListMaybe)
39 40


41 42 43 44 45
langToSearx :: Lang -> Text
langToSearx EN = "en-US"
langToSearx FR = "fr-FR"
langToSearx All = "en-US"

46
data SearxResult = SearxResult
47 48 49 50 51 52 53 54 55 56
  { _sr_url           :: Text
  , _sr_title         :: Text
  , _sr_content       :: Maybe Text
  , _sr_engine        :: Text
  , _sr_score         :: Double
  , _sr_category      :: Text
  , _sr_pretty_url    :: Text
  , _sr_publishedDate :: Text   -- "Nov 19, 2021"
  , _sr_pubdate       :: Text  -- "2021-11-19 02:12:00+0000"
  }
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
  deriving (Show, Eq, Generic)
--  , _sr_parsed_url
--  , _sr_engines
--  , _sr_positions

$(deriveJSON (unPrefix "_sr_") ''SearxResult)

data SearxResponse = SearxResponse
  { _srs_query                :: Text
  , _srs_number_of_results    :: Int
  , _srs_results              :: [SearxResult] }
  deriving (Show, Eq, Generic)
-- , _srs_answers
-- , _srs_corrections
-- , _srs_infoboxes
--  , _srs_suggestions          :: [Text]
--  , _srs_unresponsive_engines :: [Text] }

$(deriveJSON (unPrefix "_srs_") ''SearxResponse)

77
data FetchSearxParams = FetchSearxParams
78 79
  { _fsp_language :: Lang
  , _fsp_manager :: Manager
80 81 82 83 84 85
  , _fsp_pageno  :: Int
  , _fsp_query   :: Text
  , _fsp_url     :: Text
  }

fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
86 87
fetchSearxPage (FetchSearxParams { _fsp_language
                                 , _fsp_manager
88 89 90 91 92 93 94 95 96
                                 , _fsp_pageno
                                 , _fsp_query
                                 , _fsp_url }) = do
  -- searx search API:
  -- https://searx.github.io/searx/dev/search_api.html?highlight=json
  req <- parseRequest $ T.unpack _fsp_url
  let request = urlEncodedBody
        [ --("category_general", "1")
          ("q", encodeUtf8 _fsp_query)
97
        , ("categories", "news")  -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
98 99
        , ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
          --, ("time_range", "None")
100
        , ("language", encodeUtf8 $ langToSearx _fsp_language)
101 102 103 104 105 106
        , ("format", "json")
        ] req
  res <- httpLbs request _fsp_manager
  let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
  pure dec

107
-- TODO Make an async task out of this?
108
triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
109 110
            => User
            -> CorpusId
111 112
            -> API.Query
            -> Lang
113 114 115 116 117 118 119 120 121 122
            -> (JobLog -> m ())
            -> m JobLog
triggerSearxSearch user cId q l logStatus = do
  let jobLog = JobLog { _scst_succeeded = Just 1
                      , _scst_failed    = Just 0
                      , _scst_remaining = Just 1
                      , _scst_events    = Just []
                      }
  logStatus jobLog

123
  printDebug "[triggerSearxSearch] cId" cId
124 125 126 127 128
  printDebug "[triggerSearxSearch] q" q
  printDebug "[triggerSearxSearch] l" l
  cfg <- view hasConfig
  let surl = _gc_frame_searx_url cfg
  printDebug "[triggerSearxSearch] surl" surl
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
  mListId <- defaultListMaybe cId
  case mListId of
    Nothing -> do
      let failedJobLog = jobLogFailTotalWithMessage "[triggerSearxSearch] no list id" jobLog
      logStatus failedJobLog
      pure failedJobLog
    Just listId -> do
      printDebug "[triggerSearxSearch] listId" listId

      manager <- liftBase $ newManager tlsManagerSettings
      res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
                                                          , _fsp_manager = manager
                                                          , _fsp_pageno = 1
                                                          , _fsp_query = q
                                                          , _fsp_url = surl }
144
   
145
      --printDebug "[triggerSearxSearch] res" res
146 147 148 149 150

      case res of
        Left _ -> pure ()
        Right (SearxResponse { _srs_results }) -> do
          let docs = hyperdataDocumentFromSearxResult <$> _srs_results
151
          --printDebug "[triggerSearxSearch] docs" docs
152 153
          -- docs :: [Either Text HyperdataDocument]
          let docs' = catMaybes $ rightToMaybe <$> docs
154
          Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
155 156 157 158
            printDebug "[triggerSearxSearch] doc time" $
              "[title] " <> (show _hd_title) <>
              " :: [publication_year] " <> (show _hd_publication_year) <>
              " :: [publication_date] " <> (show _hd_publication_date)
159
            ) docs'
160 161 162 163
          _ <- flowDataText user (DataNew [docs']) (Multi EN) cId Nothing logStatus
          pure ()

      pure $ jobLogSuccess jobLog
164 165 166

hyperdataDocumentFromSearxResult :: SearxResult -> Either T.Text HyperdataDocument
hyperdataDocumentFromSearxResult (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
167
  let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
168 169 170 171 172 173 174
  let mGregorian = toGregorian <$> mDate
  Right HyperdataDocument { _hd_bdd = Just "Searx"
                          , _hd_doi = Nothing
                          , _hd_url = Nothing
                          , _hd_uniqId = Nothing
                          , _hd_uniqIdBdd = Nothing
                          , _hd_page = Nothing
175
                          , _hd_title = Just $ ("[" <> _sr_pubdate <> "] ") <> _sr_title
176 177 178 179
                          , _hd_authors = Nothing
                          , _hd_institutes = Nothing
                          , _hd_source = Just _sr_engine
                          , _hd_abstract = _sr_content
180
                          , _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
181 182 183 184 185 186 187 188
                          , _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
                          , _hd_publication_month = sel2 <$> mGregorian
                          , _hd_publication_day = sel3 <$> mGregorian
                          , _hd_publication_hour = Nothing
                          , _hd_publication_minute = Nothing
                          , _hd_publication_second = Nothing
                          , _hd_language_iso2 = Just $ T.pack $ show EN }