Reconstruct abstract from the index

parent d7283561
{-|
Module : Main
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2023-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext@: search, map, share
-}
module Main where module Main where
import Conduit import Conduit
...@@ -55,8 +68,9 @@ fetchWorksC fltr _ = do ...@@ -55,8 +68,9 @@ fetchWorksC fltr _ = do
Right (mCount, c) -> do Right (mCount, c) -> do
putText $ "Count: " <> show mCount putText $ "Count: " <> show mCount
_ <- runConduit $ c _ <- runConduit $ c
.| takeC 1000 .| takeC 3
.| mapM_C (\(OA.Work { .. }) -> do .| mapM_C (\w@(OA.Work { .. }) -> do
liftIO $ putText $ show id <> " :: " <> show display_name liftIO $ putText $ show id <> " :: " <> show display_name
liftIO $ putText abstract_reconstructed
) )
pure () pure ()
...@@ -37,6 +37,7 @@ library ...@@ -37,6 +37,7 @@ library
OpenAlex.Client OpenAlex.Client
OpenAlex.ServantClientLogging OpenAlex.ServantClientLogging
OpenAlex.Types OpenAlex.Types
OpenAlex.Utils
default-extensions: default-extensions:
DataKinds DataKinds
DeriveGeneric DeriveGeneric
......
...@@ -21,6 +21,7 @@ import qualified Data.Text as T ...@@ -21,6 +21,7 @@ import qualified Data.Text as T
import Data.Time (UTCTime) import Data.Time (UTCTime)
import Data.Time.Calendar (Day) import Data.Time.Calendar (Day)
import qualified Data.Time.Format as DTF import qualified Data.Time.Format as DTF
import OpenAlex.Utils (reconstructAbstract)
import Protolude hiding (Location, Meta) import Protolude hiding (Location, Meta)
-- API request types -- API request types
...@@ -201,6 +202,7 @@ instance FromJSON SummaryStats where ...@@ -201,6 +202,7 @@ instance FromJSON SummaryStats where
-- | https://docs.openalex.org/api-entities/works/work-object -- | https://docs.openalex.org/api-entities/works/work-object
data Work = Work data Work = Work
{ abstract_inverted_index :: Maybe (Map Text [Int]) -- TODO { abstract_inverted_index :: Maybe (Map Text [Int]) -- TODO
, abstract_reconstructed :: Text
, authorships :: [Authorship] , authorships :: [Authorship]
, apc_list :: Maybe APCList , apc_list :: Maybe APCList
, apc_paid :: Maybe APCPaid , apc_paid :: Maybe APCPaid
...@@ -242,6 +244,7 @@ data Work = Work ...@@ -242,6 +244,7 @@ data Work = Work
instance FromJSON Work where instance FromJSON Work where
parseJSON = withObject "Work" $ \v -> do parseJSON = withObject "Work" $ \v -> do
abstract_inverted_index <- v .: "abstract_inverted_index" abstract_inverted_index <- v .: "abstract_inverted_index"
let abstract_reconstructed = reconstructAbstract abstract_inverted_index
authorships <- v .: "authorships" authorships <- v .: "authorships"
apc_list <- v .: "apc_list" apc_list <- v .: "apc_list"
apc_paid <- v .: "apc_paid" apc_paid <- v .: "apc_paid"
......
{-|
Module : OpenAlex Utils
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2023
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module OpenAlex.Utils where
import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import Protolude
-- | https://docs.openalex.org/api-entities/works/work-object#abstract_inverted_index
-- https://en.wikipedia.org/wiki/Inverted_index
-- For legal reasons, OpenAlex doesn't include abstract full text. But
-- we can reconstruct most of it from the index.
-- The index is of form: { word : [ positions-in-text ] }
reconstructAbstract :: Maybe (Map Text [Int]) -> Text
reconstructAbstract Nothing = ""
reconstructAbstract (Just m) = T.intercalate " " $ snd <$> sort wordPositions
where
wordPositions = concatMap (\(word, positions) -> (\pos -> (pos, word)) <$> positions) $ Map.toList m
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment