Commit 4d6ae5aa authored by Alexandre Delanoë's avatar Alexandre Delanoë

[LIB] arxiv integration to manage its evolutions

parent f3e517cc
...@@ -26,6 +26,7 @@ source-repository head ...@@ -26,6 +26,7 @@ source-repository head
library library
exposed-modules: exposed-modules:
Arxiv Arxiv
Network.Api.Arxiv
other-modules: other-modules:
Paths_crawlerArxiv Paths_crawlerArxiv
hs-source-dirs: hs-source-dirs:
...@@ -33,15 +34,16 @@ library ...@@ -33,15 +34,16 @@ library
default-extensions: default-extensions:
RecordWildCards RecordWildCards
build-depends: build-depends:
arxiv base >=4.7 && <5
, base >=4.7 && <5
, bytestring , bytestring
, conduit , conduit
, http-conduit , http-conduit
, http-types , http-types
, mtl , mtl
, network , network
, parsec
, resourcet , resourcet
, split
, tagsoup , tagsoup
, text , text
default-language: Haskell2010 default-language: Haskell2010
...@@ -56,8 +58,7 @@ executable arxiv-exe ...@@ -56,8 +58,7 @@ executable arxiv-exe
RecordWildCards RecordWildCards
ghc-options: -threaded -rtsopts -with-rtsopts=-N ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
arxiv base >=4.7 && <5
, base >=4.7 && <5
, bytestring , bytestring
, conduit , conduit
, crawlerArxiv , crawlerArxiv
...@@ -65,7 +66,9 @@ executable arxiv-exe ...@@ -65,7 +66,9 @@ executable arxiv-exe
, http-types , http-types
, mtl , mtl
, network , network
, parsec
, resourcet , resourcet
, split
, tagsoup , tagsoup
, text , text
default-language: Haskell2010 default-language: Haskell2010
...@@ -89,7 +92,9 @@ test-suite arxiv-test ...@@ -89,7 +92,9 @@ test-suite arxiv-test
, http-types , http-types
, mtl , mtl
, network , network
, parsec
, resourcet , resourcet
, split
, tagsoup , tagsoup
, text , text
default-language: Haskell2010 default-language: Haskell2010
...@@ -26,7 +26,6 @@ default-extensions: ...@@ -26,7 +26,6 @@ default-extensions:
- RecordWildCards - RecordWildCards
dependencies: dependencies:
- arxiv
- base >= 4.7 && < 5 - base >= 4.7 && < 5
- bytestring - bytestring
- conduit - conduit
...@@ -37,11 +36,11 @@ dependencies: ...@@ -37,11 +36,11 @@ dependencies:
- resourcet - resourcet
- tagsoup - tagsoup
- text - text
- parsec
- split
library: library:
source-dirs: src source-dirs: src
dependencies:
- arxiv
executables: executables:
arxiv-exe: arxiv-exe:
...@@ -52,7 +51,6 @@ executables: ...@@ -52,7 +51,6 @@ executables:
- -rtsopts - -rtsopts
- -with-rtsopts=-N - -with-rtsopts=-N
dependencies: dependencies:
- arxiv
- crawlerArxiv - crawlerArxiv
tests: tests:
......
...@@ -27,7 +27,7 @@ type Limit = Int ...@@ -27,7 +27,7 @@ type Limit = Int
type Soup = Tag String type Soup = Tag String
batchSize :: Int batchSize :: Int
batchSize = 500 batchSize = 3000
------------------------------------------------------------ ------------------------------------------------------------
......
-------------------------------------------------------------------------------
-- |
-- Module : Network.Api.Arxiv.hs
-- Copyright : (c) Tobias Schoofs
-- License : LGPL
-- Stability : experimental
-- Portability: portable
--
-- The ArXiv API is split in two parts:
-- Request and Response.
-- The Request part contains
-- a simple language to define queries,
-- a query parser and some helpers to navigate
-- through the results of a mutlipage query
-- (which, in fact, induces a new query).
--
-- The Response part contains
-- an API to access the fields of the result
-- based on TagSoup.
--
-- This library does not contain functions
-- to actually execute and manage http requests.
-- It is intended to be used with existing
-- http libraries such as http-conduit.
-- An example how to use the ArXiv library
-- with http-conduit is included in this documentation.
-------------------------------------------------------------------------------
module Network.Api.Arxiv (
-- * Request
-- $RequestOv
baseUrl, apiUrl, apiQuery,
Field(..), Expression(..),
(/*/), (/+/), (/-/),
-- * Expression Example
-- $ExpExample
-- * Queries
Query(..), nextPage,
parseQuery, preprocess, parseIds,
mkQuery, exp2str, items2str, ids2str,
itemControl,
-- * Response
-- $ResponseOv
totalResults, startIndex, itemsPerPage,
getEntry, forEachEntry, forEachEntryM, forEachEntryM_,
checkForError, exhausted,
getId, getIdUrl, getUpdated, getPublished, getYear,
getTitle, getSummary,
getComment, getJournal, getDoi,
Link(..),
getLinks, getPdfLink, getPdf,
Category(..),
getCategories, getPrimaryCategory,
Author(..),
getAuthors, getAuthorNames
-- * A complete Example using http-conduit
-- $CompleteExample
)
where
import Text.HTML.TagSoup
import Text.Parsec
import Data.Char (isDigit)
import Data.Maybe (fromMaybe)
import Data.List (find, intercalate)
import qualified Data.List.Split as S
import Control.Applicative ((<$>))
import Control.Monad (void)
------------------------------------------------------------------------
-- import Debug.Trace (trace)
------------------------------------------------------------------------
{- $ExpExample
Expressions are intended to ease the construction
of well-formed queries in application code.
A simple example:
> let au = Exp $ Au ["Knuth"]
> t1 = Exp $ Ti ["The Art of Computer Programming"]
> t2 = Exp $ Ti ["Concrete Mathematics"]
> ex = au /*/ (t1 /+/ t2)
> in ...
-}
{- $RequestOv
Requests are URL parameters,
either \"search_query\" or \"id_list\".
This module provides functions
to build and parse these parameters,
to create the full request string
and to navigate through a multi-page request
with a maximum number of items per page.
For details of the Arxiv request format,
please refer to the Arxiv documentation.
-}
{- $ResponseOv
Response processing expects [Tag String] as input (see TagSoup).
The result produced by your http library
(such as http-conduit) must be converted to [Tag String]
before the result is passed to the response functions
defined here (see also the example below).
The response functions extract information from the tag soup,
either in 'String', 'Int' or TagSoup format.
For details of the Arxiv Feed format, please refer
to the Arxiv documentation.
-}
{- $CompleteExample
> module Main
> where
>
> import qualified Network.Api.Arxiv as Ax
> import Network.Api.Arxiv (Expression(..),
> Field(..), (/*/), (/+/))
> import Network.Socket (withSocketsDo)
> import Network.HTTP.Simple as HT
> import Network.HTTP.Conduit (parseRequest)
> import Network.HTTP.Types.Status
> import Data.List (intercalate)
> import qualified Data.ByteString as B hiding (unpack)
> import qualified Data.ByteString.Char8 as B (unpack)
> import Data.Conduit ((.|))
> import qualified Data.Conduit as C
> import qualified Data.Conduit.List as CL
> import Data.Function ((&))
> import Text.HTML.TagSoup
> import Control.Monad.Trans (liftIO)
> import Control.Monad.Trans.Resource (MonadResource)
> import Control.Applicative ((<$>))
>
> main :: IO ()
> main = withSocketsDo (execQuery makeQuery)
>
> makeQuery :: Ax.Query
> makeQuery =
> let au = Exp $ Au ["Aaronson"]
> t1 = Exp $ Ti ["quantum"]
> t2 = Exp $ Ti ["complexity"]
> x = au /*/ (t1 /+/ t2)
> in Ax.Query {
> Ax.qExp = Just x,
> Ax.qIds = [],
> Ax.qStart = 0,
> Ax.qItems = 25}
>
> type Soup = Tag String
>
> execQuery :: Ax.Query -> IO ()
> execQuery q = C.runConduitRes (searchAxv q .| outSnk)
>
> ----------------------------------------------------------------------
> -- Execute query and start a source
> ----------------------------------------------------------------------
> searchAxv :: MonadResource m => Ax.Query -> C.ConduitT () String m ()
> searchAxv q =
> let s = Ax.mkQuery q
> in do rsp <- HT.httpBS =<< liftIO (parseRequest s)
> case getResponseStatus rsp of
> (Status 200 _) -> getSoup (getResponseBody rsp)
> >>= results q
> st -> error $ "Error:" ++ show st
>
> ----------------------------------------------------------------------
> -- Consume page by page
> ----------------------------------------------------------------------
> getSoup :: MonadResource m =>
> B.ByteString -> C.ConduitT () String m [Soup]
> getSoup b = concat <$> (C.yield b .| toSoup .| CL.consume)
>
> ----------------------------------------------------------------------
> -- Receive a ByteString and yield Soup
> ----------------------------------------------------------------------
> toSoup :: MonadResource m => C.ConduitT B.ByteString [Soup] m ()
> toSoup = C.awaitForever (C.yield . parseTags . B.unpack)
>
> ----------------------------------------------------------------------
> -- Yield all entries and fetch next page
> ----------------------------------------------------------------------
> results :: MonadResource m =>
> Ax.Query -> [Soup] -> C.ConduitT () String m ()
> results q sp =
> if Ax.exhausted sp
> then C.yield ("EOT: " ++ show (Ax.totalResults sp) ++ " results")
> else Ax.forEachEntryM_ sp (C.yield . mkResult)
> >> searchAxv (Ax.nextPage q)
>
> ----------------------------------------------------------------------
> -- Get data and format
> ----------------------------------------------------------------------
> mkResult :: [Soup] -> String
> mkResult sp = let aus = Ax.getAuthorNames sp
> y = Ax.getYear sp
> tmp = Ax.getTitle sp & clean ['\n', '\r', '\t']
> ti = if null tmp then "No title" else tmp
> in intercalate ", " aus ++ " (" ++ y ++ "): " ++ ti
> where clean _ [] = []
> clean d (c:cs) | c `elem` d = clean d cs
> | otherwise = c:clean d cs
>
> ----------------------------------------------------------------------
> -- Sink results
> ----------------------------------------------------------------------
> outSnk :: MonadResource m => C.ConduitT String C.Void m ()
> outSnk = C.awaitForever (liftIO . putStrLn)
-}
------------------------------------------------------------------------
-- | The Arxiv base URL \"arxiv.org\"
------------------------------------------------------------------------
baseUrl :: String
baseUrl = "arxiv.org"
------------------------------------------------------------------------
-- | The Arxiv API URL \"export.arxiv.org/api\"
------------------------------------------------------------------------
apiUrl :: String
apiUrl = "https://export.arxiv.org/api/query?"
------------------------------------------------------------------------
-- | The query string (\"search_query=\" or \"id_list=\")
------------------------------------------------------------------------
apiQuery,apiIdList :: String
apiQuery = "search_query="
apiIdList = "id_list="
------------------------------------------------------------------------
-- | Field data type;
-- a field consist of a field identifier
-- (author, title, etc.)
-- and a list of search terms.
------------------------------------------------------------------------
data Field =
-- | Title
Ti [Term]
-- | Author
| Au [Term]
-- | Abstract
| Abs [Term]
-- | Comment
| Co [Term]
-- | Journal
| Jr [Term]
-- | Category
| Cat [Term]
-- | Report Number
| Rn [Term]
-- | Article identifier
| Id [Term]
-- | Any of the above
| All [Term]
deriving (Eq, Show)
------------------------------------------------------------------------
-- | A term is just a string
------------------------------------------------------------------------
type Term = String
------------------------------------------------------------------------
-- convert a field to a string
------------------------------------------------------------------------
field2str :: Field -> String
field2str (Ti s) = "ti:" ++ terms2str s
field2str (Au s) = "au:" ++ terms2str s
field2str (Abs s) = "abs:" ++ terms2str s
field2str (Co s) = "co:" ++ terms2str s
field2str (Jr s) = "jr:" ++ terms2str s
field2str (Cat s) = "cat:" ++ terms2str s
field2str (Rn s) = "rn:" ++ terms2str s
field2str (Id s) = "id:" ++ terms2str s
field2str (All s) = "all:" ++ terms2str s
------------------------------------------------------------------------
-- convert a term to a string
------------------------------------------------------------------------
terms2str :: [Term] -> String
terms2str = intercalate "+" . map term2str
where term2str t =
let x = intercalate "+" (words t)
in if '+' `elem` x then "%22" ++ x ++ "%22" else x
------------------------------------------------------------------------
-- | Expression data type.
-- An expression is either a field or a logical connection
-- of two expressions using the basic operators
-- AND, OR and ANDNOT.
------------------------------------------------------------------------
data Expression =
-- | Just a field
Exp Field
-- | Logical \"and\"
| And Expression Expression
-- | Logical \"or\"
| Or Expression Expression
-- | Logical \"and . not\"
| AndNot Expression Expression
deriving (Eq, Show)
------------------------------------------------------------------------
-- | AND operator.
-- The symbol was chosen because
-- 0 * 1 = 0.
------------------------------------------------------------------------
infix /*/
(/*/) :: Expression -> Expression -> Expression
(/*/) = And
------------------------------------------------------------------------
-- | OR operator.
-- The symbol was chosen because
-- 0 + 1 = 1.
------------------------------------------------------------------------
infix /+/
(/+/) :: Expression -> Expression -> Expression
(/+/) = Or
------------------------------------------------------------------------
-- | ANDNOT operator.
-- The symbol was chosen because
-- 1 - 1 = 0 and 1 - 0 = 1.
------------------------------------------------------------------------
infix /-/
(/-/) :: Expression -> Expression -> Expression
(/-/) = AndNot
------------------------------------------------------------------------
-- | Create a query string from an expression.
-- Note that we create redundant parentheses,
-- for instance \"a AND b OR c\" will be encoded as
-- \"a+AND+%28b+OR+c%29\".
-- The rationale is that the API specification is not clear
-- on how expressions are parsed.
-- The above expression could be understood as
-- \"a AND (b OR c)\" or as \"(a AND b) OR c\".
-- To avoid confusion, one should always use parentheses
-- to group boolean expressions - even if some of these parentheses
-- appear to be redundant under one or the other parsing strategy.
------------------------------------------------------------------------
exp2str, innerExp2str :: Expression -> String
exp2str (Exp f) = field2str f
exp2str (And e1 e2) = innerExp2str e1 ++ "+AND+" ++ innerExp2str e2
exp2str (Or e1 e2) = innerExp2str e1 ++ "+OR+" ++ innerExp2str e2
exp2str (AndNot e1 e2) = innerExp2str e1 ++ "+ANDNOT+" ++ innerExp2str e2
innerExp2str (Exp f) = exp2str (Exp f)
innerExp2str e = "%28" ++ exp2str e ++ "%29"
type Identifier = String
------------------------------------------------------------------------
-- | Query data type.
--
-- You usually want to create a query like:
--
-- > let e = (Exp $ Au ["Aaronson"]) /*/ (
-- > (Exp $ Ti ["quantum"]) /+/
-- > (Exp $ Ti ["complexity"]))
-- > in Query {
-- > qExp = Just e,
-- > qIds = ["0902.3175v2","1406.2858v1","0912.3825v1"],
-- > qStart = 0,
-- > qItems = 10}
------------------------------------------------------------------------
data Query = Query {
-- | The query expression
qExp :: Maybe Expression,
-- | Id List
qIds :: [Identifier],
-- | The first item we want to see
qStart :: Int,
-- | The number of items we want to see
qItems :: Int}
deriving (Eq, Show)
-------------------------------------------------------------------------
-- | Prepares the query to fetch
-- the next page adding \"items per page\" to \"start index\".
-------------------------------------------------------------------------
nextPage :: Query -> Query
nextPage q = let s = qStart q
i = qItems q
in q{qStart = s + i}
-------------------------------------------------------------------------
-- | Checks whether the query is exhausted or not, i.e.
-- whether all pages have been fetched already.
-- The first argument is the entire response (not just a part of it).
-------------------------------------------------------------------------
exhausted :: [Tag String] -> Bool
exhausted sp = startIndex sp >= totalResults sp
-------------------------------------------------------------------------
-- | Parses an expression from a string.
-- Please refer to the Arxiv documentation for details
-- on query format.
--
-- Just a minor remark here:
-- The operators OR, AND and ANDNOT are case sensitive.
-- \"andnot\" would be interpreted as part of a title, for instance:
-- \"ti:functional+andnot+object+oriented\" is just one title;
-- \"ti:functional+ANDNOT+object+oriented\" would cause an error,
-- because a field identifier (ti, au, etc.) is expected after
-- \"+ANDNOT+\".
--
-- The other way round: the field content itself
-- is not case sensitive, i.e.
-- \"ti:complexity\" or \"au:aaronson\" is the same as
-- \"ti:Complexity\" and \"au:Aaronson\" respectively.
-- This is a feature of the very arXiv API.
--
-- You may want to refer to the comments under
-- 'preprocess' and 'exp2str' for some more details
-- on our interpretation of the Arxiv documentation.
-------------------------------------------------------------------------
parseQuery :: String -> Either String Expression
parseQuery s = case parse expression "" $ preprocess s of
Left e -> Left $ show e
Right e -> Right e
-------------------------------------------------------------------------
-- | Converts a string containing comma-separated identifiers
-- into a list of 'Identifier's.
-- As stated already: No whitespace!
-------------------------------------------------------------------------
parseIds :: String -> [Identifier]
parseIds = S.endBy ","
-------------------------------------------------------------------------
-- | This is an internal function used by 'parseQuery'.
-- It may be occasionally useful for direct use:
-- It replaces \" \", \"(\", \")\" and \"\"\"
-- by \"+\", \"%28\", \"%29\" and \"%22\"
-- respectively.
--
-- Usually, these substitutions are performed
-- when transforming a string to an URL, which should be done
-- by your http library anyway (e.g. http-conduit).
-- But this step is usually after parsing has been performed
-- on the input string. (Considering a work flow like:
-- parseQuery >>= mkQuery >>= parseUrl >>= execQuery.)
-- The parser, however, accepts
-- only the URL-encoded characters and, thus, some preprocessing
-- may be necessary.
--
-- The other way round, this means
-- that you may use parentheses, spaces and quotation marks
-- instead of the URL encodings.
-- But be careful! Do not introduce two successive spaces -
-- we do not check for whitespace!
-------------------------------------------------------------------------
preprocess :: String -> String
preprocess = concatMap s2s . map tos
where s2s "(" = "%28"
s2s ")" = "%29"
s2s "\"" = "%22"
s2s " " = "+"
s2s c = c
tos c = [c]
-------------------------------------------------------------------------
-- | Generates the complete query string
-- including URL,
-- query expression,
-- id list and
-- item control
-------------------------------------------------------------------------
mkQuery :: Query -> String
mkQuery q = apiUrl ++ qry ++ x ++ plus ++ apiIdList ++ is ++ itm
where x = case qExp q of
Nothing -> ""
Just e -> exp2str e
plus = case qExp q of
Nothing -> ""
Just _ -> "&"
qry = case qExp q of
Nothing -> ""
Just _ -> apiQuery
is = ids2str $ qIds q
itm = items2str q
-------------------------------------------------------------------------
-- | Converts a list of 'Identifier'
-- to a string with comma-separated identifiers
-------------------------------------------------------------------------
ids2str :: [Identifier] -> String
ids2str = foldr i2s ""
where i2s i [] = i
i2s i s = i ++ "," ++ s
-------------------------------------------------------------------------
-- | Converts the query to a string containing only the item control
-------------------------------------------------------------------------
items2str :: Query -> String
items2str q = itemControl (qStart q) (qItems q)
-------------------------------------------------------------------------
-- | Generates the item control of a query string according
-- to first item and results per page:
--
-- * 'Int': Start index for this page
--
-- * 'Int': Number of results per page.
-------------------------------------------------------------------------
itemControl :: Int -> Int -> String
itemControl s m = "&amp;start=" ++ show s ++
"&amp;max_results=" ++ show m
-- ======================================================================
-- result
-- ======================================================================
------------------------------------------------------------------------
-- | Total results of the query
------------------------------------------------------------------------
totalResults :: [Tag String] -> Int
totalResults = getN "opensearch:totalResults"
------------------------------------------------------------------------
-- | Start index of this page of results
------------------------------------------------------------------------
startIndex :: [Tag String] -> Int
startIndex = getN "opensearch:startIndex"
------------------------------------------------------------------------
-- | Number of items per page
------------------------------------------------------------------------
itemsPerPage :: [Tag String] -> Int
itemsPerPage = getN "opensearch:itemsPerPage"
------------------------------------------------------------------------
-- | Checks if the feed contains an error message, i.e.
--
-- * it has only one entry,
--
-- * the title of this entry is \"Error\" and
--
-- * its id field contains an error message,
-- which is returned as 'Left'.
--
-- Apparently, this function is not necessary,
-- since the Arxiv site returns error feeds
-- with status code 400 ("bad request"),
-- which should be handled by your http library anyway.
------------------------------------------------------------------------
checkForError :: [Tag String] -> Either String ()
checkForError ts = case totalResults ts of
1 -> head $ forEachEntry ts $ \e ->
if getTitle e == "Error"
then Left $ getError e
else Right ()
_ -> Right ()
------------------------------------------------------------------------
-- | Get the first entry in the tag soup.
-- The function returns a tuple of
--
-- * The entry (if any)
--
-- * The rest of the tag soup following the first entry.
------------------------------------------------------------------------
getEntry :: [Tag String] -> ([Tag String],[Tag String])
getEntry = element "entry"
------------------------------------------------------------------------
-- | Loop through all entries in the result feed
-- applying a function on each one.
-- The results are returned as list.
-- The function is similar to 'map'
-- with the arguments swapped (as in Foldable 'forM').
--
-- Arguments:
--
-- * ['Tag' 'String']: The TagSoup through which we are looping
--
-- * ['Tag' 'String'] -> r: The function we are applying per entry;
-- the TagSoup passed in to the function
-- represents the current entry.
--
-- Example:
--
-- > forEachEntry soup $ \e ->
-- > let y = case getYear e of
-- > "" -> "s.a."
-- > x -> x
-- > a = case getAuthorNames e of
-- > [] -> "Anonymous"
-- > as -> head as ++
-- > in a ++ " (" ++ y ++ ")"
--
-- Would retrieve the name of the first author
-- and the year of publication (like "Aaronson (2013)")
-- from all entries.
------------------------------------------------------------------------
forEachEntry :: [Tag String] -> ([Tag String] -> r) -> [r]
forEachEntry = forEach "entry"
------------------------------------------------------------------------
-- | Variant of 'forEachEntry' for monadic actions.
------------------------------------------------------------------------
forEachEntryM :: Monad m =>
[Tag String] -> ([Tag String] -> m r) -> m [r]
forEachEntryM = forEachM "entry"
------------------------------------------------------------------------
-- | Variant of 'forEachEntryM' for actions
-- that do not return a result.
------------------------------------------------------------------------
forEachEntryM_ :: Monad m =>
[Tag String] -> ([Tag String] -> m ()) -> m ()
forEachEntryM_ = forEachM_ "entry"
------------------------------------------------------------------------
-- | Gets the full contents of the id field
-- (which contains an URL before the article identifier).
-- The [Tag String] argument is expected to be a single entry.
------------------------------------------------------------------------
getIdUrl :: [Tag String] -> String
getIdUrl = getString "id"
------------------------------------------------------------------------
-- | Gets the article identifier as it can be used
-- in an \"id_list\" query, i.e. without the URL.
-- The [Tag String] argument is expected to be a single entry.
------------------------------------------------------------------------
getId :: [Tag String] -> String
getId = pureId . getString "id"
------------------------------------------------------------------------
-- Extract the pure article identifier from the id string
------------------------------------------------------------------------
pureId :: String -> String
pureId s = let i = toSlash 2 (reverse s)
z = drop 6 i
in case z of
"" -> reverse i
'.':_ -> reverse $ toSlash 1 i
_ -> reverse i
where toSlash :: Int -> String -> String
toSlash i m = let x = takeWhile (/= '/') m
in if i == 1 then x
else x ++ ('/' :
toSlash (i-1) (drop (length x + 1) m))
------------------------------------------------------------------------
-- Get the error message
------------------------------------------------------------------------
getError :: [Tag String] -> String
getError = pureError . getString "id"
------------------------------------------------------------------------
-- Extract the pure error message from the id string
------------------------------------------------------------------------
pureError :: String -> String
pureError = drop 1 . dropWhile (/= '#')
------------------------------------------------------------------------
-- | Gets the contents of the \"updated\" field in this entry, i.e.
-- the date when the article was last updated.
-- Be aware that there is another \"updated\" field
-- right below the root node of the result.
-- Make sure your are operating on an entry,
-- not on the root node!
------------------------------------------------------------------------
getUpdated :: [Tag String] -> String
getUpdated = getString "updated"
------------------------------------------------------------------------
-- | Gets the contents of the \"published\" field in this entry, i.e.
-- the date when the article was last uploaded.
------------------------------------------------------------------------
getPublished :: [Tag String] -> String
getPublished = getString "published"
------------------------------------------------------------------------
-- | Gets the year of the \"published\" field in this entry.
------------------------------------------------------------------------
getYear :: [Tag String] -> String
getYear sp = case getPublished sp of
"" -> "s.a."
s -> takeWhile (/= '-') s
------------------------------------------------------------------------
-- | Gets the title of this entry.
------------------------------------------------------------------------
getTitle :: [Tag String] -> String
getTitle = getString "title"
------------------------------------------------------------------------
-- | Gets the summary of this entry.
------------------------------------------------------------------------
getSummary :: [Tag String] -> String
getSummary = getString "summary"
------------------------------------------------------------------------
-- | Gets author''s comment (in \"arxiv:comment\") of this entry.
------------------------------------------------------------------------
getComment :: [Tag String] -> String
getComment = getString "arxiv:comment"
------------------------------------------------------------------------
-- | Gets the journal information (in \"arxiv:journal_ref\")
-- of this entry.
------------------------------------------------------------------------
getJournal :: [Tag String] -> String
getJournal = getString "arxiv:journal_ref"
------------------------------------------------------------------------
-- | Gets the digital object identifier (in \"arxiv:doi\")
-- of this entry.
------------------------------------------------------------------------
getDoi :: [Tag String] -> String
getDoi = getString "arxiv:doi"
------------------------------------------------------------------------
-- | The Link data type
------------------------------------------------------------------------
data Link = Link {
-- | The hyperlink
lnkHref :: String,
-- | The link type (a MIME type)
lnkType :: String,
-- | The link title (e.g. \"pdf\" would be the link
-- where we find the article
-- in pdf format)
lnkTitle :: String,
-- | the link relation (e.g. \"related\" would point
-- to the related information,
-- such as the pdf document)
lnkRel :: String}
deriving (Show, Eq)
------------------------------------------------------------------------
-- | Gets all links in the entry.
------------------------------------------------------------------------
getLinks :: [Tag String] -> [Link]
getLinks soup = case element "link" soup of
([],_) -> []
(x:_,[]) -> [mkLink x]
(x:_,rs) -> mkLink x : getLinks rs
where mkLink l = Link {
lnkHref = fromMaybe "" $ getAt "href" l,
lnkTitle = fromMaybe "" $ getAt "title" l,
lnkRel = fromMaybe "" $ getAt "rel" l,
lnkType = fromMaybe "" $ getAt "type" l}
------------------------------------------------------------------------
-- | Gets only the pdf link of this entry (if any).
------------------------------------------------------------------------
getPdfLink :: [Tag String] -> Maybe Link
getPdfLink soup = case getLinks soup of
[] -> Nothing
ls -> find (\l -> lnkTitle l == "pdf") ls
------------------------------------------------------------------------
-- | Gets the hyperlink to the pdf document of this entry (if any).
------------------------------------------------------------------------
getPdf :: [Tag String] -> String
getPdf soup = case getPdfLink soup of
Nothing -> ""
Just l -> lnkHref l
------------------------------------------------------------------------
-- | Category data type
------------------------------------------------------------------------
data Category = Category {
-- | The category term (e.g. \"math-ph\")
catTerm :: String,
-- | The category scheme
catScheme :: String}
deriving (Show, Eq)
------------------------------------------------------------------------
-- Make category from TagSoup
------------------------------------------------------------------------
mkCat :: Tag String -> Category
mkCat c = Category {
catTerm = fromMaybe "" $ getAt "term" c,
catScheme = fromMaybe "" $ getAt "scheme" c}
------------------------------------------------------------------------
-- | Gets the categories of this entry.
------------------------------------------------------------------------
getCategories :: [Tag String] -> [Category]
getCategories soup = case element "category" soup of
([],_) -> []
(x:_,[]) -> [mkCat x]
(x:_,rs) -> mkCat x : getCategories rs
------------------------------------------------------------------------
-- | Gets the primary category of this entry (if any).
------------------------------------------------------------------------
getPrimaryCategory :: [Tag String] -> Maybe Category
getPrimaryCategory soup = case element "arxiv:primary_category" soup of
([],_) -> Nothing
(x:_,_) -> Just (mkCat x)
------------------------------------------------------------------------
-- | The Author data type
------------------------------------------------------------------------
data Author = Author {
-- | Author name
auName :: String,
-- | Author Affiliation
auFil :: String}
deriving (Show, Eq)
------------------------------------------------------------------------
-- | Gets the authors of this entry.
------------------------------------------------------------------------
getAuthors :: [Tag String] -> [Author]
getAuthors soup = case element "author" soup of
([],_) -> []
(xs,rs) -> mkAut xs : getAuthors rs
where mkAut au = let nm = case element "name" au of
([],_) -> ""
(n,_) -> findTxt n
fl = case element "arxiv:affiliation" au of
([],_) -> ""
(a,_) -> findTxt a
in Author {
auName = nm,
auFil = fl}
------------------------------------------------------------------------
-- | Gets the names of all authors of this entry.
------------------------------------------------------------------------
getAuthorNames :: [Tag String] -> [String]
getAuthorNames = go
where go s = case element "author" s of
([],[]) -> []
(a,[]) -> [getString "name" a]
(a,r) -> getString "name" a : go r
------------------------------------------------------------------------
-- Lookup attribute by name
------------------------------------------------------------------------
getAt :: String -> Tag String -> Maybe String
getAt a (TagOpen _ as) = lookup a as
getAt _ _ = Nothing
------------------------------------------------------------------------
-- Find a 'TagText' and return the content
------------------------------------------------------------------------
getString :: String -> [Tag String] -> String
getString n soup = let (i,_) = element n soup
in if null i then "" else findTxt i
------------------------------------------------------------------------
-- Find a 'TagText' and return the contentas a 'Int'.
-- If the tag is not found or the content is not a number,
-- -1 is returned.
------------------------------------------------------------------------
getN :: String -> [Tag String] -> Int
getN key soup = case element key soup of
(k,_) -> case findTxt k of
"" -> -1
t -> if all isDigit t then read t else -1
------------------------------------------------------------------------
-- Get the content of a 'TagText'
------------------------------------------------------------------------
findTxt :: [Tag String] -> String
findTxt [] = ""
findTxt (t:ts) = case t of
TagText x -> x
_ -> findTxt ts
------------------------------------------------------------------------
-- Map a function to all occurences of an element in the soup
------------------------------------------------------------------------
forEach :: String -> [Tag String] -> ([Tag String] -> r) -> [r]
forEach nm soup f = case element nm soup of
([],_) -> []
(e,rs) -> f e : forEach nm rs f
------------------------------------------------------------------------
-- Variant of forEach for monadic actions
------------------------------------------------------------------------
forEachM :: Monad m =>
String -> [Tag String] -> ([Tag String] -> m r) -> m [r]
forEachM nm soup f = case element nm soup of
([],_) -> return []
(e,rs) -> do r <- f e
rr <- forEachM nm rs f
return (r:rr)
------------------------------------------------------------------------
-- Variant of forEachM for actions that do not return anything
------------------------------------------------------------------------
forEachM_ :: Monad m =>
String -> [Tag String] -> ([Tag String] -> m ()) -> m ()
forEachM_ nm soup f = case element nm soup of
([],_) -> return ()
(e,rs) -> f e >> forEachM_ nm rs f
------------------------------------------------------------------------
-- Find occurrence of an element and
-- return this element (open tag to close tag) and
-- the rest of the soup behind the element.
------------------------------------------------------------------------
element :: String -> [Tag String] -> ([Tag String], [Tag String])
element _ [] = ([],[])
element nm (t:ts) | isTagOpenName nm t = let (r,rs) = closeEl 0 ts
in (t:r,rs)
| otherwise = element nm ts
where closeEl :: Int -> [Tag String] -> ([Tag String], [Tag String])
closeEl _ [] = ([],[])
closeEl i (x:xs) = go i (isTagCloseName nm x) x xs
go i b x xs | b && i == 0 = ([x],xs)
| b && i > 0 = let (r,rs) = closeEl (i-1) xs
in (x:r,rs)
| isTagOpenName nm x = let (r,rs) = closeEl (i+1) xs
in (x:r,rs)
| otherwise = let (r,rs) = closeEl i xs
in (x:r,rs)
------------------------------------------------------------------------
-- Expression Parser
------------------------------------------------------------------------
type Parser a = Parsec String () a
------------------------------------------------------------------------
-- Expression is something in parentheses or something
-- that starts with a field
------------------------------------------------------------------------
expression :: Parser Expression
expression = try parentheses <|> fieldOperator
------------------------------------------------------------------------
-- A field potentially followed by an operator
------------------------------------------------------------------------
fieldOperator :: Parser Expression
fieldOperator = do
f <- field
c <- try (char '+') <|> return ' '
if c == ' ' then return f
else opAndArg f
------------------------------------------------------------------------
-- Find an operator and an expression
------------------------------------------------------------------------
opAndArg :: Expression -> Parser Expression
opAndArg f = do
o <- op
void $ char '+'
e <- expression
return (o f e)
------------------------------------------------------------------------
-- A field consists of a fieldId and a list of terms
------------------------------------------------------------------------
field :: Parser Expression
field = do
i <- fieldId
ts <- terms
return (Exp $ i ts)
------------------------------------------------------------------------
-- The field ids
------------------------------------------------------------------------
fieldId :: Parser ([Term] -> Field)
fieldId = try (void (string "au:" ) >> return Au)
<|> try (void (string "ti:" ) >> return Ti)
<|> try (void (string "abs:") >> return Abs)
<|> try (void (string "co:" ) >> return Co)
<|> try (void (string "cat:") >> return Cat)
<|> try (void (string "jr:" ) >> return Jr)
<|> try (void (string "rn:" ) >> return Rn)
<|> try (void (string "id:" ) >> return Id)
<|> (void (string "all:") >> return All)
------------------------------------------------------------------------
-- A term may be quoted,
-- otherwise we build terms as list of strings
-- separated by '+'
------------------------------------------------------------------------
terms :: Parser [String]
terms = do
t <- try quoted <|> term
c <- try (lookAhead anyChar) <|> onEof '%' --ugly
case c of
'%' -> return [t]
'+' -> do x <- isOp
if x then return [t]
else void (char c) >> (t:) <$> terms
_ -> fail $ "unexpected symbol: '" ++ [c] ++ "'"
------------------------------------------------------------------------
-- Checks if an operator follows without consuming input
------------------------------------------------------------------------
isOp :: Parser Bool
isOp = try (void (lookAhead (string "+ANDNOT+")) >> return True)
<|> try (void (lookAhead (string "+AND+")) >> return True)
<|> try (void (lookAhead (string "+OR+")) >> return True)
<|> return False
------------------------------------------------------------------------
-- A quoted term
------------------------------------------------------------------------
quoted :: Parser String
quoted = do
void $ string "%22"
intercalate "+" <$> go
where go = do
t <- term
s <- try (string "%22") <|> return ""
if not (null s) then return [t]
else do c <- anyChar
let t' = if c == '+' then t
else t ++ [c]
(t':) <$> go
------------------------------------------------------------------------
-- A single term
------------------------------------------------------------------------
term :: Parser String
term = do
c <- try (lookAhead anyChar) <|> onEof '%'
if c `elem` "%+"
then return ""
else do x <- char c
(x:) <$> term
------------------------------------------------------------------------
-- Signal EOF by returning the specified char
------------------------------------------------------------------------
onEof :: Char -> Parser Char
onEof c = eof >> return c
------------------------------------------------------------------------
-- An expression in parentheses,
-- which may be followed by another expression
------------------------------------------------------------------------
parentheses :: Parser Expression
parentheses = do
void $ string "%28"
e <- expression
void $ string "%29"
c <- try (char '+') <|> try (lookAhead (char '%')) <|> onEof '.'
case c of
'+' -> opAndArg e
_ -> return e
------------------------------------------------------------------------
-- Parse operator (note that it is essential to
-- to process "ANDNOT" before "AND"!
------------------------------------------------------------------------
op :: Parser (Expression -> Expression -> Expression)
op = try (void (string "ANDNOT") >> return AndNot)
<|> try (void (string "OR") >> return Or)
<|> (void (string "AND") >> return And)
...@@ -41,7 +41,6 @@ packages: ...@@ -41,7 +41,6 @@ packages:
# commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a # commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
# #
extra-deps: extra-deps:
- arxiv-0.0.3@sha256:02de1114091d11f1f3ab401d104d125ad4301260806feb7f63b3dcefc7db88cf,1588
# Override default flag values for local packages and extra-deps # Override default flag values for local packages and extra-deps
# flags: {} # flags: {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment