diff --git a/package.yaml b/package.yaml index 1572d9ecbddb407b69624461b53b5e4cc83096e1..8af9fce0fc891ffb640e353ffe232f2d49c182b7 100644 --- a/package.yaml +++ b/package.yaml @@ -28,6 +28,12 @@ dependencies: - http-client - text - containers +- lens +- bytestring +- utf8-string +- split +- scientific +- vector library: source-dirs: src diff --git a/src/HAL/Client.hs b/src/HAL/Client.hs index 2081a35225eaff8b6f7cde7b696b77dcdd44a6df..438a615e4c53dd0c2f1d99750f7fbec07f2e431a 100644 --- a/src/HAL/Client.hs +++ b/src/HAL/Client.hs @@ -2,6 +2,7 @@ {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE TypeOperators #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TemplateHaskell #-} module HAL.Client where @@ -12,8 +13,18 @@ import Servant.Client hiding (Response) import Data.Text import Data.Map import Data.Aeson +import qualified Data.ByteString.Lazy as BSL +import qualified Codec.Binary.UTF8.String as UTF -type HALAPI = Search +import Control.Lens as L (makeLenses) + +type HALAPI = Search :<|> Structure + +type Structure = "ref" :> "structure" + :> QueryParam "fq" Text + :> QueryParam "fl" Text + :> QueryParam "rows" Int + :> Get '[JSON] Response type Search = "search" -- fl determine which fields will be returned it can be a list of fields or * @@ -42,13 +53,30 @@ asc = Just . Asc desc :: Text -> Maybe SortField desc = Just . Desc +newtype Doc = Doc (Map Text Value) + deriving (Generic) + +instance Eq Doc where + (==) (Doc doc) (Doc doc') = (doc ! "docid") == (doc' ! "docid") + +instance Show Doc where + show (Doc o) = (UTF.decode $ BSL.unpack $ encode $ o ! "label_s") + <> "(" + <> (show . encode $ o ! "docid") + <> ")" + +instance FromJSON Doc +instance ToJSON Doc + -- Response type data Response = Response { - numFound :: Integer, - start :: Int, - docs :: [Doc] + _numFound :: Integer, + _start :: Int, + _docs :: [Doc] } deriving (Show, Generic) +L.makeLenses ''Response + instance FromJSON Response where parseJSON (Object o) = Response <$> @@ -56,17 +84,18 @@ instance FromJSON Response where <*> ((o .: "response") >>= (.: "start")) <*> ((o .: "response") >>= (.: "docs")) -newtype Doc = Doc (Map Text Value) - deriving (Show, Generic) -instance FromJSON Doc halAPI :: Proxy HALAPI halAPI = Proxy -search :: [Text] - -> [Text] - -> Maybe SortField - -> Maybe Int - -> Maybe Int +structure :: Maybe Text -- fq + -> Maybe Text + -> Maybe Int -- rows + -> ClientM Response +search :: [Text] -- fl + -> [Text] -- fq + -> Maybe SortField -- sort + -> Maybe Int -- start + -> Maybe Int -- rows -> ClientM Response -search = client halAPI +(search :<|> structure) = client halAPI diff --git a/src/Tree.hs b/src/Tree.hs new file mode 100644 index 0000000000000000000000000000000000000000..e68fb4711479355ad6f1e302b4a43de2d5f5770b --- /dev/null +++ b/src/Tree.hs @@ -0,0 +1,103 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE TypeOperators #-} +{-# LANGUAGE OverloadedStrings #-} + +module Tree where + +import GHC.Generics +import Network.HTTP.Client (newManager) +import Network.HTTP.Client.TLS (tlsManagerSettings) +import Servant.Client hiding (Response) +import Control.Lens.Getter ((^.)) +import Data.Map ((!?), (!), insert, empty, Map, fromList, toList) +import qualified Data.Map as M +import Data.Map.Internal (merge, preserveMissing, zipWithMatched) +import Data.Text (pack, Text) +import Data.Aeson (Value(..), ToJSON, encode) +import Data.List (groupBy, isInfixOf) +import Data.List.Split (chunksOf) +import Data.Either (rights) +import Data.Maybe (fromMaybe) +import Data.Scientific (Scientific, floatingOrInteger) +import qualified Data.Vector as V +import Text.Printf + +import HAL.Client + +runHalAPIClient :: ClientM Response -> IO (Either ServantError Response) +runHalAPIClient cmd = do + manager' <- newManager tlsManagerSettings + runClientM cmd (mkClientEnv manager' $ BaseUrl Https "api.archives-ouvertes.fr" 443 "") + +docid :: Doc -> Value +docid (Doc doc) = doc ! "docid" + +formatParentIdRequest :: [Doc] -> Maybe Text +formatParentIdRequest [] = Nothing +formatParentIdRequest (x:[]) = Just . pack . show $ docid x +formatParentIdRequest (x:xs) = + (Just . pack . show $ docid x) + <> (Just " || ") + <> formatParentIdRequest xs + + +runStructureRequest :: Maybe Text -> IO (Either ServantError Response) +runStructureRequest rq = + runHalAPIClient $ structure rq (Just "docid, parentDocid_i, label_s") (Just 10000) + +scientific2text :: Scientific -> Text +scientific2text n = + case floatingOrInteger n of + Left r -> pack $ show (r :: Double) + Right i -> pack $ show (i :: Integer) + +ds2Child :: [Doc] -> IO [Doc] +ds2Child ds = do + rs <- sequence $ runStructureRequest <$> formatedRequests ds + return . concat $ (^. docs) <$> rights rs + where formatedRequest docs' = Just "parentDocid_i:(" <> formatParentIdRequest docs' <> Just ")" + formatedRequests docs'' = formatedRequest <$> chunksOf 100 docs'' + +fetchChildren :: [Doc] -> IO [Doc] +fetchChildren [] = pure [] +fetchChildren ds = (ds <>) <$> (fetchChildren =<< ds2Child ds) + +isChildOf :: Doc -> Doc -> Bool +isChildOf (Doc d) (Doc d') = not . null $ V.filter (\id -> id == (scientific2text docId)) ((\(String a) -> a) <$> parentDocIds) + where (Number docId ) = (d ! "docid") + (Array parentDocIds) = (d' ! "parentDocid_i") + +data DocTree = DocTree Doc Int [DocTree] + deriving (Show, Generic) +instance ToJSON DocTree + +buildTree :: Int -> [Doc] -> Doc -> DocTree +buildTree depth docs id = DocTree id depth (buildTree (depth + 1) docs <$> children) + where children = filter (isChildOf id) docs + +formatTree :: DocTree -> String +formatTree tree@(DocTree doc depth children) = + printf "%s%s\n%s" (addSpace) (show doc) (concat $ formatTree <$> children) + where addSpace + | depth > 0 = "├" <> (replicate (depth * depth + depth) '─') + | otherwise = "🌲" + +findDeepest :: Map Scientific Int -> DocTree -> Map Scientific Int +findDeepest m tree@(DocTree doc depth children) = + mergeMap map1 maps + where map1 = insert docid' depth m + maps = foldl mergeMap empty $ findDeepest m <$> children + mergeMap m1 m2 = merge preserveMissing preserveMissing (zipWithMatched whenMatch) m1 m2 + whenMatch k v v' = if v > v' then v else v' + (Number docid') = docid doc + +isDeep :: Map Scientific Int -> DocTree -> Bool +isDeep m (DocTree i depth _) = depth >= (fromMaybe 0 $ M.lookup id m) + where (Number id) = docid i + +removeDuplicate :: Map Scientific Int -> DocTree -> DocTree +removeDuplicate deepMap tree@(DocTree doc depth children) = DocTree doc depth (removeDuplicate deepMap <$> filter (isDeep deepMap) children) + +noDuplicateTree :: DocTree -> DocTree +noDuplicateTree tree = removeDuplicate (findDeepest M.empty tree) tree