Commit 234d84ce authored by delanoe

Merge branch 'dev' into 'master'

Basic functionality and README documentation

See merge request !4
parents 07d1892d a03304f5
[submodule "inline-js"]
path = inline-js
url = https://github.com/tweag/inline-js
# Changelog for searx
## Unreleased changes
Copyright Mudada (c) 2019
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Mudada nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# SEARX Crawler
[searx](https://en.wikipedia.org/wiki/Searx) is a privacy-respecting, hackable metasearch engine.
## Base website
https://searx.me/
## Based on
https://gitlab.iscpif.fr/smurail/gargantext-light/blob/simon-gargantext-light/gargantext/scrapers/searx.py
## Usage
The crawler's main function fetches search results and their metadata from a searx instance.
### Entry function
The basic entry point of this crawler is the function `SEARX.getMetadataWith`:
```hs
getMetadataWith "artificial intelligence" 10
```
This function takes a `Text` representing the query you want to run on searx
and an `Int` representing the maximum number of pages you want to fetch.
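For reference, its signature as defined in the `SEARX` module is:
```hs
getMetadataWith :: T.Text -> Int -> IO (Either ClientError Documents)
```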
### Return Type
The return type is pretty simple: it is **either** a `ClientError` or a `SEARX.Client.Documents`.
`Documents` represents a collection of `SEARX.Client.Document` together with the number of `Document`s returned.
`Document` is a simple type that contains nearly all the information we need (id, title, abstract, publicationDate, sources).
**authors** is not present since it is not provided by the searx database.
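For instance, here is a minimal sketch using the record accessors exported by `SEARX.Client` (`titlesOf` is only an illustrative helper, not part of the library) that collects the titles from a successful response:
```hs
import qualified Data.Text as T
import Servant.Client (ClientError)
import SEARX.Client

-- Collect the titles of the returned hits; a ClientError yields no titles.
titlesOf :: Either ClientError Documents -> [Maybe T.Text]
titlesOf (Left _)     = []
titlesOf (Right docs) = _document_title <$> _documents_hits docs
```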
### Example
Here is a basic main using the entry point of the crawler and printing the first 5 results.
```hs
{-# LANGUAGE OverloadedStrings #-}
module Main where

import SEARX
import SEARX.Client

main :: IO ()
main = do
  res <- getMetadataWith "ia" 300
  case res of
    (Left err) -> print err
    (Right r)  -> print $ take 5 $ _documents_hits r
```
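Assuming the stack setup shipped in this repository (the `inline-js` submodule must be checked out, since it is listed in `extra-deps`), the bundled executable can be built and run with:
```
git submodule update --init
stack build
stack exec crawlerSEARX-exe
```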
import Distribution.Simple
main = defaultMain
{-# LANGUAGE OverloadedStrings #-}
module Main where
import SEARX
import SEARX.Client
import System.Directory
import Text.HTML.TagSoup
import Data.Maybe
import qualified Data.Text as T
-- An Article with every field empty, used as a fallback when parsing fails.
emptyArticle :: Article
emptyArticle = Article
  { title        = Nothing
  , byline       = Nothing
  , dir          = Nothing
  , content      = Nothing
  , textContent  = Nothing
  , SEARX.length = Nothing
  , excerpt      = Nothing
  , siteName     = Nothing
  }
type Depth = Int
type Limit = Int
type Query = T.Text
-- Run a searx query, keep at most `l` result URLs, and parse each website
-- recursively down to the given depth.
searxSearch :: Query -> Limit -> Depth -> IO [Maybe Article]
searxSearch q l d = do
  res <- getMetadataWith q l
  case res of
    (Left _err) -> return []
    (Right r)   -> do
      let urls = take l $ _document_id <$> _documents_hits r
      parseWebsiteReq d urls
-- Parse the given URLs and, while depth remains, follow the links found in
-- the parsed articles.
parseWebsiteReqWithFp :: FilePath -> Depth -> [T.Text] -> IO [Maybe Article]
parseWebsiteReqWithFp fp d urls
  | d <= 0 = parseWebsite'
  | otherwise = do
      articles <- parseWebsite'
      deeper   <- parseWebsiteReqWithFp fp
                    (d - 1)
                    (getUrlsFromWebsite articles)
      return $ deeper <> articles
  where
    parseWebsite' = sequence $ parseWebsite fp <$> urls
-- Set up the Node working directory, run the recursive crawl, then clean up
-- the cloned readability repository.
parseWebsiteReq :: Depth -> [T.Text] -> IO [Maybe Article]
parseWebsiteReq d urls = do
  fp <- setUpDirectory
  articles <- parseWebsiteReqWithFp fp d urls
  removeDirectoryRecursive $ fp <> "/readability"
  return articles
-- Extract every non-empty href attribute from the HTML content of the
-- parsed articles.
getUrlsFromWebsite :: [Maybe Article] -> [T.Text]
getUrlsFromWebsite articles =
  filter (/= "")
    $ fromAttrib "href"
    <$> ( filter isTagOpen
            $ concat
            $ parseTags
            <$> ((fromMaybe "" . content . fromMaybe emptyArticle) <$> articles)
        )
main :: IO ()
main = do
  -- Search "abeille", keep 10 results, and follow links one level deep.
  articles <- searxSearch "abeille" 10 1
  print articles
Subproject commit ad33fe42821d146a6dc8c18c4f31ee7ba27e3fa4
name: crawlerSEARX
version: 0.1.0.0
git: "https://git@gitlab.iscpif.fr:20022/gargantext/crawlers/searx"
license: BSD3
author: "Mudada"
maintainer: "mael.nicolas77@gmail.com"
copyright: "Mudada"
extra-source-files:
- README.md
- ChangeLog.md
# Metadata used when publishing your package
# synopsis: Short description of your package
# category: Web
# To avoid duplicated efforts in documentation and dealing with the
# complications of embedding Haddock markup inside cabal files, it is
# common to point users to the README.md file.
description: Please see the README on GitHub at <https://github.com/Mudada/searx#readme>
dependencies:
- aeson
- base >= 4.7 && < 5
- text
- lens
- servant
- servant-client
- http-client
- http-client-tls
- inline-js
- inline-js-core
- process
- directory
- temporary
library:
  source-dirs: src

executables:
  crawlerSEARX-exe:
    main: Main.hs
    source-dirs: app
    ghc-options:
    - -threaded
    - -rtsopts
    - -with-rtsopts=-N
    dependencies:
    - crawlerSEARX

tests:
  searx-test:
    main: Spec.hs
    source-dirs: test
    ghc-options:
    - -threaded
    - -rtsopts
    - -with-rtsopts=-N
    dependencies:
    - crawlerSEARX
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveAnyClass #-}
module SEARX where
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Language.JavaScript.Inline
import System.Directory
import System.IO.Temp
import System.Process
import Servant.Client
import SEARX.Client
import Data.Foldable
import qualified Data.Text as T
import Data.Aeson
import GHC.Generics
{-
debug purpose, use getMetadataWith

getMetadataWith2 :: T.Text -> IO (Either ClientError Documents)
getMetadataWith2 q = do
  manager' <- newManager tlsManagerSettings
  runClientM
    (search (Just q) (Just 1) (Just 3) (Just "") (Just All) (Just Json))
    (mkClientEnv manager' $ BaseUrl Https "search.iscpif.fr" 443 "")

read($url, function(err, article, meta) {
  // Title
  article.close();
-}
-- | The article structure returned by Mozilla's Readability parser.
data Article =
  Article { title       :: Maybe T.Text
          , byline      :: Maybe T.Text
          , dir         :: Maybe T.Text
          , content     :: Maybe T.Text
          , textContent :: Maybe T.Text
          , length      :: Maybe Int
          , excerpt     :: Maybe T.Text
          , siteName    :: Maybe T.Text
          }
  deriving (Generic, Show, FromJSON, ToJSON)
-- | Install the Node dependencies (jsdom, request-promise-native) and clone
-- Mozilla's readability into the current directory, then return that directory.
setUpDirectory :: IO FilePath
setUpDirectory = do
  dir <- getCurrentDirectory
  _ <- withCurrentDirectory dir $
    traverse_
      callCommand
      [ "npm init --yes"
      , "npm install --save jsdom request-promise-native"
      , "git clone https://github.com/mozilla/readability.git"
      ]
  getCurrentDirectory
-- | Fetch a URL in a Node session rooted at the given directory and run it
-- through Readability; returns Nothing when fetching or parsing fails.
parseWebsite :: FilePath -> T.Text -> IO (Maybe Article)
parseWebsite tmpdir url =
  withJSSession
    defJSSessionOpts {nodeWorkDir = Just tmpdir}
    [block|
      var {JSDOM} = require('jsdom');
      var rp = require('request-promise-native');
      var Readability = require('./readability');
      try {
        const val = await rp($url);
        const doc = new JSDOM(val, {url: $url});
        const reader = new Readability(doc.window.document);
        return reader.parse();
      } catch (err) {
        return null;
      }
    |]
-- Combine two page results, preferring Right values and concatenating them.
specConcatEith :: Semigroup a => Either b a -> Either b a -> Either b a
specConcatEith (Left _) b = b
specConcatEith a (Left _) = a
specConcatEith (Right a) (Right b) = Right $ a <> b
-- | Query the searx instance at search.iscpif.fr and concatenate the results
-- of pages 1 through i.
getMetadataWith :: T.Text -> Int -> IO (Either ClientError Documents)
getMetadataWith q i = do
  manager' <- newManager tlsManagerSettings
  getMetadataWith' manager' q i 1
  where
    client' man = mkClientEnv man $ BaseUrl Https "search.iscpif.fr" 443 ""
    search' r' p' = search (Just r') (Just 1) (Just p') (Just "") (Just All) (Just Json)
    getMetadataWith' man req maxP curP
      | maxP >= curP = do
          val <- runClientM (search' req curP) (client' man)
          nextVal <- getMetadataWith' man req maxP (curP + 1)
          return $ specConcatEith val nextVal
      | otherwise = return (Right $ Documents 0 [])
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TemplateHaskell #-}
module SEARX.Client where
import GHC.Generics
import Data.Aeson
import Control.Applicative ((<|>))
import Servant.API
import Servant.Client
import Data.Proxy (Proxy(..))
import qualified Data.Text as T
import qualified Control.Lens as L
-- | One search hit as returned by searx.
data Document = Document
  { _document_id              :: T.Text
  , _document_title           :: Maybe T.Text
  , _document_abstract        :: Maybe T.Text
  , _document_publicationDate :: Maybe T.Text
  , _document_sources         :: [T.Text]
  } deriving (Show, Generic)

L.makeLenses ''Document
instance FromJSON Document where
  parseJSON (Object o) =
    Document
      <$> (o .:  "url")
      <*> (o .:? "title")
      <*> (o .:? "content")
      <*> (o .:? "pubdate")
      <*> (o .: "engines" <|> pure [])
  parseJSON _ = fail "Document: expected an object"
-- | A page of results: total count plus the hits themselves.
data Documents = Documents
  { _documents_total :: Int
  , _documents_hits  :: [Document]
  } deriving (Show, Generic)

L.makeLenses ''Documents
instance Semigroup Documents where
  a <> b = Documents
    (_documents_total a + _documents_total b)
    (_documents_hits a <> _documents_hits b)

instance FromJSON Documents where
  parseJSON (Object o) =
    Documents <$> (o .: "number_of_results") <*> (o .: "results")
  parseJSON _ = fail "Documents: expected an object"
-- | Response format requested from the searx API (only JSON is used here).
data Format = Json
  deriving (Show, Generic)

instance ToHttpApiData Format where
  toUrlPiece Json = "json"

-- | Language filter (only "all" is used here).
data Language = All
  deriving (Show, Generic)

instance ToHttpApiData Language where
  toUrlPiece All = "all"
-- | The searx search endpoint, described as a servant API type.
type Search =
  QueryParam "q" T.Text
    :> QueryParam "category_news" Int
    :> QueryParam "pageno" Int
    :> QueryParam "time_range" T.Text
    :> QueryParam "language" Language
    :> QueryParam "format" Format
    :> Post '[JSON] Documents

type SEARXAPI = Search

searxProxy :: Proxy SEARXAPI
searxProxy = Proxy

search :: Maybe T.Text -> Maybe Int -> Maybe Int -> Maybe T.Text -> Maybe Language -> Maybe Format -> ClientM Documents
search = client searxProxy
# This file was automatically generated by 'stack init'
#
# Some commonly used options have been documented as comments in this file.
# For advanced use and comprehensive documentation of the format, please see:
# https://docs.haskellstack.org/en/stable/yaml_configuration/
# Resolver to choose a 'specific' stackage snapshot or a compiler version.
# A snapshot resolver dictates the compiler version and the set of packages
# to be used for project dependencies. For example:
#
# resolver: lts-3.5
# resolver: nightly-2015-09-21
# resolver: ghc-7.10.2
#
# The location of a snapshot can be provided as a file or url. Stack assumes
# a snapshot provided as a file might change, whereas a url resource does not.
#
# resolver: ./custom-snapshot.yaml
# resolver: https://example.com/snapshots/2018-01-01.yaml
resolver: lts-14.4
# User packages to be built.
# Various formats can be used as shown in the example below.
#
# packages:
# - some-directory
# - https://example.com/foo/bar/baz-0.0.2.tar.gz
# - location:
# git: https://github.com/commercialhaskell/stack.git
# commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a
# subdirs:
# - auto-update
# - wai
packages:
- .
# Dependency packages to be pulled from upstream that are not in the resolver
# using the same syntax as the packages field.
# (e.g., acme-missiles-0.3)
extra-deps:
- ./inline-js/inline-js-core
- ./inline-js/inline-js
# Override default flag values for local packages and extra-deps
# flags: {}
# Extra package databases containing global packages
# extra-package-dbs: []
# Control whether we use the GHC we find on the path
# system-ghc: true
#
# Require a specific version of stack, using version ranges
# require-stack-version: -any # Default
# require-stack-version: ">=1.9"
#
# Override the architecture used by stack, especially useful on Windows
# arch: i386
# arch: x86_64
#
# Extra directories used by stack for building
# extra-include-dirs: [/path/to/dir]
# extra-lib-dirs: [/path/to/dir]
#
# Allow a newer minor version of GHC than the snapshot specifies
# compiler-check: newer-minor
main :: IO ()
main = putStrLn "Test suite not yet implemented"