Commit 1dbd9392 authored by mzheng

Rename Corpus to Document for clarity

parent 3665ccda
...@@ -8,7 +8,7 @@ import Data.Text qualified as T ...@@ -8,7 +8,7 @@ import Data.Text qualified as T
import HAL (getMetadataWithCursorOptsC, countResultsOpts', HalCrawlerOptions(..), defaultHalOptions, getMetadataWith, generateRequestByStructID) import HAL (getMetadataWithCursorOptsC, countResultsOpts', HalCrawlerOptions(..), defaultHalOptions, getMetadataWith, generateRequestByStructID)
import HAL.Types import HAL.Types
import HAL.Doc import HAL.Doc
import HAL.Doc.Corpus (Corpus(..)) import HAL.Doc.Document (Document(..))
import Network.HTTP.Client (newManager) import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings) import Network.HTTP.Client.TLS (tlsManagerSettings)
import Options.Applicative import Options.Applicative
...@@ -63,13 +63,13 @@ main = do run =<< execParser opts ...@@ -63,13 +63,13 @@ main = do run =<< execParser opts
-- case res of -- case res of
-- (Left err) -> print err -- (Left err) -> print err
-- (Right val) -> do -- (Right val) -> do
-- mapM_ printCorpus $ _docs val -- mapM_ printDocument $ _docs val
-- this function is for debug purpose -- | This function is for debug purpose
printCorpus :: MonadIO m => Corpus -> m () printDocument :: Document -> IO ()
printCorpus Corpus { .. } = do printDocument Document { .. } = do
putText $ "StructId: [" <> T.intercalate ", " (map (T.pack . show) _corpus_struct_id) <> "]" putText $ "StructId: [" <> T.intercalate ", " (map (T.pack . show) _document_struct_id) <> "]"
putText $ "Authors affiliations: [" <> T.intercalate ", " _corpus_authors_affiliations <> "]" putText $ "Authors affiliations: [" <> T.intercalate ", " _document_authors_affiliations <> "]"
putText "------------" putText "------------"
...@@ -88,16 +88,16 @@ run (Fetch (FetchParams { fp_query, fp_limit, fp_lang })) = do ...@@ -88,16 +88,16 @@ run (Fetch (FetchParams { fp_query, fp_limit, fp_lang })) = do
Right (_cnt, docsC) -> do Right (_cnt, docsC) -> do
_ <- runConduit $ _ <- runConduit $
docsC docsC
.| mapM_C printCorpus .| mapM_C printDocument
.| sinkList .| sinkList
pure () pure ()
where where
opts = defaultHalOptions { _hco_debugLogs = True } opts = defaultHalOptions { _hco_debugLogs = True }
printCorpus Corpus { .. } = do printDocument Document { .. } = do
putText $ "docid: " <> _corpus_docid <> " [" <> (T.intercalate " " _corpus_title) <> "]" putText $ "docid: " <> _document_docid <> " [" <> (T.intercalate " " _document_title) <> "]"
putText $ " " <> (T.intercalate " " _corpus_abstract) putText $ " " <> (T.intercalate " " _document_abstract)
putText $ " " <> show _corpus_abstract_lang_map putText $ " " <> show _document_abstract_lang_map
putText $ " " <> show _corpus_original putText $ " " <> show _document_original
putText "------------" putText "------------"
......
...@@ -29,7 +29,7 @@ library ...@@ -29,7 +29,7 @@ library
HAL HAL
HAL.Client HAL.Client
HAL.Doc HAL.Doc
HAL.Doc.Corpus HAL.Doc.Document
HAL.Doc.EntityTree HAL.Doc.EntityTree
HAL.Doc.Struct HAL.Doc.Struct
HAL.Types HAL.Types
......
...@@ -6,7 +6,7 @@ import Data.Aeson ( FromJSON ) ...@@ -6,7 +6,7 @@ import Data.Aeson ( FromJSON )
import Data.LanguageCodes (ISO639_1(..)) import Data.LanguageCodes (ISO639_1(..))
import Data.Text qualified as T import Data.Text qualified as T
import HAL.Client ( SortField(Asc), search, structure, searchCursor ) import HAL.Client ( SortField(Asc), search, structure, searchCursor )
import HAL.Doc.Corpus ( Corpus ) import HAL.Doc.Document ( Document )
import HAL.Doc.Struct ( Struct ) import HAL.Doc.Struct ( Struct )
import HAL.Types (Response(..)) import HAL.Types (Response(..))
import HAL.Utils (langAbstractS, toText) import HAL.Utils (langAbstractS, toText)
...@@ -48,7 +48,7 @@ getMetadataWithCursorC :: Query ...@@ -48,7 +48,7 @@ getMetadataWithCursorC :: Query
-- rows returned. -- rows returned.
-> Maybe ISO639_1 -> Maybe ISO639_1
-- ^ An optional language for the search. -- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ())) -> IO (Either ClientError (Maybe Count, ConduitT () Document IO ()))
getMetadataWithCursorC = getMetadataWithCursorOptsC defaultHalOptions getMetadataWithCursorC = getMetadataWithCursorOptsC defaultHalOptions
-- | Fetch metadata using cursors -- | Fetch metadata using cursors
...@@ -62,7 +62,7 @@ getMetadataWithCursorOptsC :: HalCrawlerOptions ...@@ -62,7 +62,7 @@ getMetadataWithCursorOptsC :: HalCrawlerOptions
-- rows returned. -- rows returned.
-> Maybe ISO639_1 -> Maybe ISO639_1
-- ^ An optional language for the search. -- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ())) -> IO (Either ClientError (Maybe Count, ConduitT () Document IO ()))
getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do
-- Basically this works as follows: -- Basically this works as follows:
-- - fetch first page with cursor = "*" -- - fetch first page with cursor = "*"
...@@ -78,7 +78,7 @@ getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do ...@@ -78,7 +78,7 @@ getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do
fq = queryWithLang lang fq = queryWithLang lang
get' :: Count get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ()) -> (Maybe Count, ConduitT () Document IO ())
get' numFound' = get' numFound' =
( Just numResults ( Just numResults
, producer "*" , producer "*"
...@@ -89,7 +89,7 @@ getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do ...@@ -89,7 +89,7 @@ getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do
limit = min numFound' $ fromMaybe numFound' mb_limit limit = min numFound' $ fromMaybe numFound' mb_limit
numResults = limit numResults = limit
producer :: Text -> ConduitT () Corpus IO () producer :: Text -> ConduitT () Document IO ()
producer cursor = do producer cursor = do
let endpoint = searchCursor (Just q) (Just $ requestedFields lang) fq sort_ (Just $ fromIntegral _hco_batchSize) (Just cursor) let endpoint = searchCursor (Just q) (Just $ requestedFields lang) fq sort_ (Just $ fromIntegral _hco_batchSize) (Just cursor)
liftIO $ debugLog opts $ "[getMetadataWithCursorLangC] producer: " <> show cursor liftIO $ debugLog opts $ "[getMetadataWithCursorLangC] producer: " <> show cursor
...@@ -116,7 +116,7 @@ countResultsOpts' opts q lang = do ...@@ -116,7 +116,7 @@ countResultsOpts' opts q lang = do
-- Set rows=0 to query number of results -- Set rows=0 to query number of results
-- https://api.archives-ouvertes.fr/docs/search#rows -- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents -- First, estimate the total number of documents
eRes <- runHalAPIClient opts $ search (Just q) (Just $ requestedFields Nothing) fq Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Corpus)) eRes <- runHalAPIClient opts $ search (Just q) (Just $ requestedFields Nothing) fq Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Document))
pure (fromIntegral . _numFound <$> eRes) pure (fromIntegral . _numFound <$> eRes)
where where
fq = queryWithLang lang fq = queryWithLang lang
...@@ -135,7 +135,7 @@ getMetadataWith :: [Query] ...@@ -135,7 +135,7 @@ getMetadataWith :: [Query]
-- rows returned. -- rows returned.
-> Maybe ISO639_1 -> Maybe ISO639_1
-- ^ An optional language for the search. -- ^ An optional language for the search.
-> IO (Either ClientError (Response Corpus)) -> IO (Either ClientError (Response Document))
getMetadataWith qs start_ limit lang = do getMetadataWith qs start_ limit lang = do
runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields lang) [] Nothing start_ (fromIntegral <$> limit) runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields lang) [] Nothing start_ (fromIntegral <$> limit)
where where
...@@ -152,7 +152,7 @@ getMetadataWithC :: [Query] ...@@ -152,7 +152,7 @@ getMetadataWithC :: [Query]
-- rows returned. -- rows returned.
-> Maybe ISO639_1 -> Maybe ISO639_1
-- ^ An optional language for the search. -- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ())) -> IO (Either ClientError (Maybe Count, ConduitT () Document IO ()))
getMetadataWithC qs start_ limit lang = getMetadataWithOptsC defaultHalOptions (qs <> queryWithLang lang) start_ limit lang getMetadataWithC qs start_ limit lang = getMetadataWithOptsC defaultHalOptions (qs <> queryWithLang lang) start_ limit lang
getMetadataWithOptsC :: HalCrawlerOptions getMetadataWithOptsC :: HalCrawlerOptions
...@@ -166,14 +166,14 @@ getMetadataWithOptsC :: HalCrawlerOptions ...@@ -166,14 +166,14 @@ getMetadataWithOptsC :: HalCrawlerOptions
-- rows returned. -- rows returned.
-> Maybe ISO639_1 -> Maybe ISO639_1
-- ^ An optional language for the search. -- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ())) -> IO (Either ClientError (Maybe Count, ConduitT () Document IO ()))
getMetadataWithOptsC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang = do getMetadataWithOptsC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang = do
-- First, estimate the total number of documents -- First, estimate the total number of documents
eCount <- countResults qs eCount <- countResults qs
pure $ get' <$> eCount pure $ get' <$> eCount
where where
get' :: Count get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ()) -> (Maybe Count, ConduitT () Document IO ())
get' numFound' = get' numFound' =
( Just numResults ( Just numResults
, yieldMany [0..] , yieldMany [0..]
...@@ -188,7 +188,7 @@ getMetadataWithOptsC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang = ...@@ -188,7 +188,7 @@ getMetadataWithOptsC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang =
numResults = limit - fromIntegral offset numResults = limit - fromIntegral offset
numPages = numResults `div` fromIntegral _hco_batchSize + 1 numPages = numResults `div` fromIntegral _hco_batchSize + 1
getPage :: Start -> Int -> IO [Corpus] getPage :: Start -> Int -> IO [Document]
getPage start' pageNum = do getPage start' pageNum = do
let offset = start' + pageNum * _hco_batchSize let offset = start' + pageNum * _hco_batchSize
debugLog opts $ "[getMetadataWithLangC] getPage: " <> show offset debugLog opts $ "[getMetadataWithLangC] getPage: " <> show offset
...@@ -208,7 +208,7 @@ countResults qs = do ...@@ -208,7 +208,7 @@ countResults qs = do
-- Set rows=0 to query number of results -- Set rows=0 to query number of results
-- https://api.archives-ouvertes.fr/docs/search#rows -- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents -- First, estimate the total number of documents
eRes <- runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Corpus)) eRes <- runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Document))
pure (fromIntegral . _numFound <$> eRes) pure (fromIntegral . _numFound <$> eRes)
where where
q = joinQueries qs q = joinQueries qs
...@@ -258,7 +258,7 @@ runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct)) ...@@ -258,7 +258,7 @@ runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct))
runStructureRequest rq = runStructureRequest rq =
runHalAPIClient defaultHalOptions $ structure (Just structFields) rq (Just 10000) runHalAPIClient defaultHalOptions $ structure (Just structFields) rq (Just 10000)
runSearchRequest :: [Text] -> IO (Either ClientError (Response Corpus)) runSearchRequest :: [Text] -> IO (Either ClientError (Response Document))
runSearchRequest rq = runSearchRequest rq =
runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing Nothing Nothing runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing Nothing Nothing
where where
......
module HAL.Doc module HAL.Doc
( module HAL.Doc.EntityTree ( module HAL.Doc.EntityTree
, module HAL.Doc.Corpus ) , module HAL.Doc.Document )
where where
import HAL.Doc.EntityTree import HAL.Doc.EntityTree
import HAL.Doc.Corpus import HAL.Doc.Document
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
module HAL.Doc.Corpus where module HAL.Doc.Document where
import Control.Lens qualified as L import Control.Lens qualified as L
import Data.Aeson ( (.:), (.:?), withObject, FromJSON(parseJSON), Object ) import Data.Aeson ( (.:), (.:?), withObject, FromJSON(parseJSON), Object )
...@@ -17,50 +17,50 @@ import qualified Data.Text as T ...@@ -17,50 +17,50 @@ import qualified Data.Text as T
import Data.Aeson.Types (Parser) import Data.Aeson.Types (Parser)
import Prelude qualified as P import Prelude qualified as P
data Corpus = Corpus data Document = Document
{ _corpus_docid :: Text { _document_docid :: Text
, _corpus_title :: [Text] , _document_title :: [Text]
, _corpus_abstract :: [Text] , _document_abstract :: [Text]
, _corpus_abstract_lang_map :: Map ISO639_1 [Text] , _document_abstract_lang_map :: Map ISO639_1 [Text]
, _corpus_date :: Maybe Text , _document_date :: Maybe Text
, _corpus_source :: Maybe Text , _document_source :: Maybe Text
, _corpus_authors_names :: [Text] , _document_authors_names :: [Text]
, _corpus_authors_affiliations :: [Text] , _document_authors_affiliations :: [Text]
, _corpus_struct_id :: [Int] , _document_struct_id :: [Int]
, _corpus_original :: Object , _document_original :: Object
} deriving (Show, Generic) } deriving (Show, Generic)
L.makeLenses ''Corpus L.makeLenses ''Document
instance Default Corpus where instance Default Document where
def = Corpus "default Id" def def def def def def def def mempty def = Document "default Id" def def def def def def def def mempty
instance FromJSON Corpus where instance FromJSON Document where
parseJSON = withObject "Corpus" $ \o -> do parseJSON = withObject "Document" $ \o -> do
_corpus_docid <- o .: "docid" _document_docid <- o .: "docid"
_corpus_title <- o .: "title_s" <|> return [] _document_title <- o .: "title_s" <|> return []
_corpus_abstract <- o .: "en_abstract_s" <|> return [] _document_abstract <- o .: "en_abstract_s" <|> return []
_corpus_date <- o .:? "submittedDate_s" _document_date <- o .:? "submittedDate_s"
_corpus_source <- o .:? "source_s" _document_source <- o .:? "source_s"
_corpus_authors_names <- o .: "authFullName_s" <|> return [] _document_authors_names <- o .: "authFullName_s" <|> return []
idsNames <- o .:? "structIdName_fs" :: Parser (Maybe [Text]) -- unparsed (contains a _FacetSep_) idsNames <- o .:? "structIdName_fs" :: Parser (Maybe [Text]) -- unparsed (contains a _FacetSep_)
let structIdname = getStructIdsNames idsNames let structIdname = getStructIdsNames idsNames
let _corpus_struct_id = map fst structIdname let _document_struct_id = map fst structIdname
let _corpus_authors_affiliations = map snd structIdname let _document_authors_affiliations = map snd structIdname
abstracts <- abstracts <-
mapM (\lang -> do mapM (\lang -> do
ma <- o .:? fromString (T.unpack $ langAbstractS lang) ma <- o .:? fromString (T.unpack $ langAbstractS lang)
pure $ (\a -> (lang, a)) <$> ma) allLangs pure $ (\a -> (lang, a)) <$> ma) allLangs
let _corpus_abstract_lang_map = Map.fromList $ catMaybes abstracts let _document_abstract_lang_map = Map.fromList $ catMaybes abstracts
let _corpus_original = o let _document_original = o
pure $ Corpus { .. } pure $ Document { .. }
-- | this function parses the field structIdName_fs that looks like : -- | Parses the field structIdName_fs that looks like :
-- > StructId_FacetSep_StructName -- > StructId_FacetSep_StructName
-- --
-- returns [(StructId, StructName)] -- returns [(StructId, StructName)]
...@@ -71,5 +71,5 @@ getStructIdsNames (Just idsNames) = map (\tab -> (P.read (T.unpack (P.head tab)) ...@@ -71,5 +71,5 @@ getStructIdsNames (Just idsNames) = map (\tab -> (P.read (T.unpack (P.head tab))
splitInstitutes = P.map (T.splitOn (T.pack "_FacetSep_")) splitInstitutes = P.map (T.splitOn (T.pack "_FacetSep_"))
getStructIdsNames Nothing = [] getStructIdsNames Nothing = []
instance ToHttpApiData Corpus where instance ToHttpApiData Document where
toUrlPiece _ = "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,structId_i,structIdName_fs" toUrlPiece _ = "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,structId_i,structIdName_fs"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment