Commit 234d84ce authored by delanoe

Merge branch 'dev' into 'master'

Basic functionality and README documentation

See merge request !4
parents 07d1892d a03304f5
[submodule "inline-js"]
path = inline-js
url = https://github.com/tweag/inline-js
# Changelog for searx
## Unreleased changes
Copyright Mudada (c) 2019
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Mudada nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# SEARX Crawler
[searx](https://en.wikipedia.org/wiki/Searx) is a privacy-respecting, hackable metasearch engine.
## Base website
https://searx.me/
## Based on
https://gitlab.iscpif.fr/smurail/gargantext-light/blob/simon-gargantext-light/gargantext/scrapers/searx.py
## Usage
The crawler's main function fetches search results and their metadata from a searx instance.
### Entry function
The basic entry point of this crawler is the function `SEARX.getMetadataWith`:
```hs
getMetadataWith "artificial intelligence" 10
```
This function takes a `Text` representing the query you want to run on searx
and an `Int` representing the maximum number of pages you want to fetch.
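For reference, its signature as defined in the `SEARX` module is:
```hs
getMetadataWith :: T.Text -> Int -> IO (Either ClientError Documents)
```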
### Return Type
The return type is pretty simple: it is **either** a `ClientError` or a `SEARX.Client.Documents`.
`Documents` represents a collection of `SEARX.Client.Document` together with the number of `Document`s returned.
`Document` is a simple type that contains nearly all the information we need (id, title, abstract, publicationDate, sources).
**authors** is not present since it is not provided by the searx database.
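For instance, here is a minimal sketch using the record accessors exported by `SEARX.Client` (`titlesOf` is only an illustrative helper, not part of the library) that collects the titles from a successful response:
```hs
import qualified Data.Text as T
import Servant.Client (ClientError)
import SEARX.Client

-- Collect the titles of the returned hits; a ClientError yields no titles.
titlesOf :: Either ClientError Documents -> [Maybe T.Text]
titlesOf (Left _)     = []
titlesOf (Right docs) = _document_title <$> _documents_hits docs
```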
### Example
Here is a basic main using the entry point of the crawler and printing the first 5 results.
```hs
{-# LANGUAGE OverloadedStrings #-}
module Main where

import SEARX
import SEARX.Client

main :: IO ()
main = do
  res <- getMetadataWith "ia" 300
  case res of
    (Left err) -> print err
    (Right r)  -> print $ take 5 $ _documents_hits r
```
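Assuming the stack setup shipped in this repository (the `inline-js` submodule must be checked out, since it is listed in `extra-deps`), the bundled executable can be built and run with:
```
git submodule update --init
stack build
stack exec crawlerSEARX-exe
```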
import Distribution.Simple
main = defaultMain
{-# LANGUAGE OverloadedStrings #-}
module Main where
import SEARX
import SEARX.Client
import System.Directory
import Text.HTML.TagSoup
import Data.Maybe
import qualified Data.Text as T
-- An Article with every field empty, used as a fallback when parsing fails.
emptyArticle :: Article
emptyArticle = Article
  { title        = Nothing
  , byline       = Nothing
  , dir          = Nothing
  , content      = Nothing
  , textContent  = Nothing
  , SEARX.length = Nothing
  , excerpt      = Nothing
  , siteName     = Nothing
  }
type Depth = Int
type Limit = Int
type Query = T.Text
-- Run a searx query, keep at most `l` result URLs, and parse each website
-- recursively down to the given depth.
searxSearch :: Query -> Limit -> Depth -> IO [Maybe Article]
searxSearch q l d = do
  res <- getMetadataWith q l
  case res of
    (Left _err) -> return []
    (Right r)   -> do
      let urls = take l $ _document_id <$> _documents_hits r
      parseWebsiteReq d urls
-- Parse the given URLs and, while depth remains, follow the links found in
-- the parsed articles.
parseWebsiteReqWithFp :: FilePath -> Depth -> [T.Text] -> IO [Maybe Article]
parseWebsiteReqWithFp fp d urls
  | d <= 0 = parseWebsite'
  | otherwise = do
      articles <- parseWebsite'
      deeper   <- parseWebsiteReqWithFp fp
                    (d - 1)
                    (getUrlsFromWebsite articles)
      return $ deeper <> articles
  where
    parseWebsite' = sequence $ parseWebsite fp <$> urls
-- Set up the Node working directory, run the recursive crawl, then clean up
-- the cloned readability repository.
parseWebsiteReq :: Depth -> [T.Text] -> IO [Maybe Article]
parseWebsiteReq d urls = do
  fp <- setUpDirectory
  articles <- parseWebsiteReqWithFp fp d urls
  removeDirectoryRecursive $ fp <> "/readability"
  return articles
-- Extract every non-empty href attribute from the HTML content of the
-- parsed articles.
getUrlsFromWebsite :: [Maybe Article] -> [T.Text]
getUrlsFromWebsite articles =
  filter (/= "")
    $ fromAttrib "href"
    <$> ( filter isTagOpen
            $ concat
            $ parseTags
            <$> ((fromMaybe "" . content . fromMaybe emptyArticle) <$> articles)
        )
main :: IO ()
main = do
  -- Search "abeille", keep 10 results, and follow links one level deep.
  articles <- searxSearch "abeille" 10 1
  print articles
Subproject commit ad33fe42821d146a6dc8c18c4f31ee7ba27e3fa4
name: crawlerSEARX
version: 0.1.0.0
git: "https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/searx"
license: BSD3
author: "Mudada"
maintainer: "mael.nicolas77@gmail.com"
copyright: "Mudada"
extra-source-files:
- README.md
- ChangeLog.md
# Metadata used when publishing your package
# synopsis: Short description of your package
# category: Web
# To avoid duplicated efforts in documentation and dealing with the
# complications of embedding Haddock markup inside cabal files, it is
# common to point users to the README.md file.
description: Please see the README on GitHub at <https://github.com/Mudada/searx#readme>
dependencies:
- aeson
- base >= 4.7 && < 5
- text
- lens
- servant
- servant-client
- http-client
- http-client-tls
- inline-js
- inline-js-core
- process
- directory
- temporary
library:
  source-dirs: src

executables:
  crawlerSEARX-exe:
    main: Main.hs
    source-dirs: app
    ghc-options:
    - -threaded
    - -rtsopts
    - -with-rtsopts=-N
    dependencies:
    - crawlerSEARX

tests:
  searx-test:
    main: Spec.hs
    source-dirs: test
    ghc-options:
    - -threaded
    - -rtsopts
    - -with-rtsopts=-N
    dependencies:
    - crawlerSEARX
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveAnyClass #-}
module SEARX where
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Language.JavaScript.Inline
import System.Directory
import System.IO.Temp
import System.Process
import Servant.Client
import SEARX.Client
import Data.Foldable
import qualified Data.Text as T
import Data.Aeson
import GHC.Generics
{-
debug purpose, use getMetadataWith

getMetadataWith2 :: T.Text -> IO (Either ClientError Documents)
getMetadataWith2 q = do
  manager' <- newManager tlsManagerSettings
  runClientM
    (search (Just q) (Just 1) (Just 3) (Just "") (Just All) (Just Json))
    (mkClientEnv manager' $ BaseUrl Https "search.iscpif.fr" 443 "")

read($url, function(err, article, meta) {
  // Title
  article.close();
-}
-- | The article structure returned by Mozilla's Readability parser.
data Article =
  Article { title       :: Maybe T.Text
          , byline      :: Maybe T.Text
          , dir         :: Maybe T.Text
          , content     :: Maybe T.Text
          , textContent :: Maybe T.Text
          , length      :: Maybe Int
          , excerpt     :: Maybe T.Text
          , siteName    :: Maybe T.Text
          }
  deriving (Generic, Show, FromJSON, ToJSON)
-- | Install the Node dependencies (jsdom, request-promise-native) and clone
-- Mozilla's readability into the current directory, then return that directory.
setUpDirectory :: IO FilePath
setUpDirectory = do
  dir <- getCurrentDirectory
  _ <- withCurrentDirectory dir $
    traverse_
      callCommand
      [ "npm init --yes"
      , "npm install --save jsdom request-promise-native"
      , "git clone https://github.com/mozilla/readability.git"
      ]
  getCurrentDirectory
-- | Fetch a URL in a Node session rooted at the given directory and run it
-- through Readability; returns Nothing when fetching or parsing fails.
parseWebsite :: FilePath -> T.Text -> IO (Maybe Article)
parseWebsite tmpdir url =
  withJSSession
    defJSSessionOpts {nodeWorkDir = Just tmpdir}
    [block|
      var {JSDOM} = require('jsdom');
      var rp = require('request-promise-native');
      var Readability = require('./readability');
      try {
        const val = await rp($url);
        const doc = new JSDOM(val, {url: $url});
        const reader = new Readability(doc.window.document);
        return reader.parse();
      } catch (err) {
        return null;
      }
    |]
-- Combine two page results, preferring Right values and concatenating them.
specConcatEith :: Semigroup a => Either b a -> Either b a -> Either b a
specConcatEith (Left _) b = b
specConcatEith a (Left _) = a
specConcatEith (Right a) (Right b) = Right $ a <> b
-- | Query the searx instance at search.iscpif.fr and concatenate the results
-- of pages 1 through i.
getMetadataWith :: T.Text -> Int -> IO (Either ClientError Documents)
getMetadataWith q i = do
  manager' <- newManager tlsManagerSettings
  getMetadataWith' manager' q i 1
  where
    client' man = mkClientEnv man $ BaseUrl Https "search.iscpif.fr" 443 ""
    search' r' p' = search (Just r') (Just 1) (Just p') (Just "") (Just All) (Just Json)
    getMetadataWith' man req maxP curP
      | maxP >= curP = do
          val <- runClientM (search' req curP) (client' man)
          nextVal <- getMetadataWith' man req maxP (curP + 1)
          return $ specConcatEith val nextVal
      | otherwise = return (Right $ Documents 0 [])
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TemplateHaskell #-}
module SEARX.Client where
import GHC.Generics
import Data.Aeson
import Control.Applicative ((<|>))
import Servant.API
import Servant.Client
import Data.Proxy (Proxy(..))
import qualified Data.Text as T
import qualified Control.Lens as L
-- | One search hit as returned by searx.
data Document = Document
  { _document_id              :: T.Text
  , _document_title           :: Maybe T.Text
  , _document_abstract        :: Maybe T.Text
  , _document_publicationDate :: Maybe T.Text
  , _document_sources         :: [T.Text]
  } deriving (Show, Generic)

L.makeLenses ''Document
instance FromJSON Document where
  parseJSON (Object o) =
    Document
      <$> (o .:  "url")
      <*> (o .:? "title")
      <*> (o .:? "content")
      <*> (o .:? "pubdate")
      <*> (o .: "engines" <|> pure [])
  parseJSON _ = fail "Document: expected an object"
-- | A page of results: total count plus the hits themselves.
data Documents = Documents
  { _documents_total :: Int
  , _documents_hits  :: [Document]
  } deriving (Show, Generic)

L.makeLenses ''Documents
instance Semigroup Documents where
  a <> b = Documents
    (_documents_total a + _documents_total b)
    (_documents_hits a <> _documents_hits b)

instance FromJSON Documents where
  parseJSON (Object o) =
    Documents <$> (o .: "number_of_results") <*> (o .: "results")
  parseJSON _ = fail "Documents: expected an object"
-- | Response format requested from the searx API (only JSON is used here).
data Format = Json
  deriving (Show, Generic)

instance ToHttpApiData Format where
  toUrlPiece Json = "json"

-- | Language filter (only "all" is used here).
data Language = All
  deriving (Show, Generic)

instance ToHttpApiData Language where
  toUrlPiece All = "all"
-- | The searx search endpoint, described as a servant API type.
type Search =
  QueryParam "q" T.Text
    :> QueryParam "category_news" Int
    :> QueryParam "pageno" Int
    :> QueryParam "time_range" T.Text
    :> QueryParam "language" Language
    :> QueryParam "format" Format
    :> Post '[JSON] Documents

type SEARXAPI = Search

searxProxy :: Proxy SEARXAPI
searxProxy = Proxy

search :: Maybe T.Text -> Maybe Int -> Maybe Int -> Maybe T.Text -> Maybe Language -> Maybe Format -> ClientM Documents
search = client searxProxy
# This file was automatically generated by 'stack init'
#
# Some commonly used options have been documented as comments in this file.
# For advanced use and comprehensive documentation of the format, please see:
# https://docs.haskellstack.org/en/stable/yaml_configuration/
# Resolver to choose a 'specific' stackage snapshot or a compiler version.
# A snapshot resolver dictates the compiler version and the set of packages
# to be used for project dependencies. For example:
#
# resolver: lts-3.5
# resolver: nightly-2015-09-21
# resolver: ghc-7.10.2
#
# The location of a snapshot can be provided as a file or url. Stack assumes
# a snapshot provided as a file might change, whereas a url resource does not.
#
# resolver: ./custom-snapshot.yaml
# resolver: https://example.com/snapshots/2018-01-01.yaml
resolver: lts-14.4
# User packages to be built.
# Various formats can be used as shown in the example below.
#
# packages:
# - some-directory
# - https://example.com/foo/bar/baz-0.0.2.tar.gz
# - location:
# git: https://github.com/commercialhaskell/stack.git
# commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a
# subdirs:
# - auto-update
# - wai
packages:
- .
# Dependency packages to be pulled from upstream that are not in the resolver
# using the same syntax as the packages field.
# (e.g., acme-missiles-0.3)
extra-deps:
- ./inline-js/inline-js-core
- ./inline-js/inline-js
# Override default flag values for local packages and extra-deps
# flags: {}
# Extra package databases containing global packages
# extra-package-dbs: []
# Control whether we use the GHC we find on the path
# system-ghc: true
#
# Require a specific version of stack, using version ranges
# require-stack-version: -any # Default
# require-stack-version: ">=1.9"
#
# Override the architecture used by stack, especially useful on Windows
# arch: i386
# arch: x86_64
#
# Extra directories used by stack for building
# extra-include-dirs: [/path/to/dir]
# extra-lib-dirs: [/path/to/dir]
#
# Allow a newer minor version of GHC than the snapshot specifies
# compiler-check: newer-minor
main :: IO ()
main = putStrLn "Test suite not yet implemented"