Commit 9cfea968 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] parse to hyperdataDocuments (todo: optim).

parent 62c8870d
...@@ -66,7 +66,6 @@ toTree' m n = ...@@ -66,7 +66,6 @@ toTree' m n =
TreeN (toNodeTree n) $ TreeN (toNodeTree n) $
m ^.. at (Just $ dt_nodeId n) . _Just . each . to (toTree' m) m ^.. at (Just $ dt_nodeId n) . _Just . each . to (toTree' m)
------------------------------------------------------------------------ ------------------------------------------------------------------------
toNodeTree :: DbTreeNode -> NodeTree toNodeTree :: DbTreeNode -> NodeTree
toNodeTree (DbTreeNode nId tId _ n) = NodeTree n nodeType nId toNodeTree (DbTreeNode nId tId _ n) = NodeTree n nodeType nId
......
...@@ -102,7 +102,7 @@ $(deriveJSON (unPrefix "hyperdataDocumentV3_") ''HyperdataDocumentV3) ...@@ -102,7 +102,7 @@ $(deriveJSON (unPrefix "hyperdataDocumentV3_") ''HyperdataDocumentV3)
------------------------------------------------------------------------ ------------------------------------------------------------------------
data HyperdataDocument = HyperdataDocument { _hyperdataDocument_bdd :: Maybe Text data HyperdataDocument = HyperdataDocument { _hyperdataDocument_bdd :: Maybe Text
, _hyperdataDocument_doi :: Maybe Int , _hyperdataDocument_doi :: Maybe Text
, _hyperdataDocument_url :: Maybe Text , _hyperdataDocument_url :: Maybe Text
, _hyperdataDocument_uniqId :: Maybe Text , _hyperdataDocument_uniqId :: Maybe Text
, _hyperdataDocument_page :: Maybe Int , _hyperdataDocument_page :: Maybe Int
...@@ -113,11 +113,11 @@ data HyperdataDocument = HyperdataDocument { _hyperdataDocument_bdd ...@@ -113,11 +113,11 @@ data HyperdataDocument = HyperdataDocument { _hyperdataDocument_bdd
, _hyperdataDocument_publication_date :: Maybe Text , _hyperdataDocument_publication_date :: Maybe Text
, _hyperdataDocument_publication_year :: Maybe Int , _hyperdataDocument_publication_year :: Maybe Int
, _hyperdataDocument_publication_month :: Maybe Int , _hyperdataDocument_publication_month :: Maybe Int
, _hyperdataDocument_publication_day :: Maybe Int
, _hyperdataDocument_publication_hour :: Maybe Int , _hyperdataDocument_publication_hour :: Maybe Int
, _hyperdataDocument_publication_minute :: Maybe Int , _hyperdataDocument_publication_minute :: Maybe Int
, _hyperdataDocument_publication_second :: Maybe Int , _hyperdataDocument_publication_second :: Maybe Int
, _hyperdataDocument_language_iso2 :: Maybe Text , _hyperdataDocument_language_iso2 :: Maybe Text
, _hyperdataDocument_language_iso3 :: Maybe Text
} deriving (Show, Generic) } deriving (Show, Generic)
$(deriveJSON (unPrefix "_hyperdataDocument_") ''HyperdataDocument) $(deriveJSON (unPrefix "_hyperdataDocument_") ''HyperdataDocument)
$(makeLenses ''HyperdataDocument) $(makeLenses ''HyperdataDocument)
...@@ -128,7 +128,7 @@ instance ToField HyperdataDocument where ...@@ -128,7 +128,7 @@ instance ToField HyperdataDocument where
toHyperdataDocuments :: [(Text, Text)] -> [HyperdataDocument] toHyperdataDocuments :: [(Text, Text)] -> [HyperdataDocument]
toHyperdataDocuments ts = map (\(t1,t2) -> HyperdataDocument Nothing Nothing Nothing Nothing Nothing (Just t1) toHyperdataDocuments ts = map (\(t1,t2) -> HyperdataDocument Nothing Nothing Nothing Nothing Nothing (Just t1)
Nothing (Just t2) Nothing Nothing Nothing Nothing (Just t2) Nothing Nothing Nothing
Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing
) ts ) ts
hyperdataDocuments :: [HyperdataDocument] hyperdataDocuments :: [HyperdataDocument]
......
...@@ -20,13 +20,17 @@ please follow the types. ...@@ -20,13 +20,17 @@ please follow the types.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE PackageImports #-} {-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers (parse, FileFormat(..), clean) module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
where where
import System.FilePath (FilePath(), takeExtension) import System.FilePath (FilePath(), takeExtension)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries) import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Monad (join)
import Data.Time (UTCTime(..))
import qualified Data.Time as DT
import Data.Either.Extra (partitionEithers) import Data.Either.Extra (partitionEithers)
import Data.List (concat) import Data.List (concat)
import qualified Data.Map as DM import qualified Data.Map as DM
...@@ -44,10 +48,15 @@ import Control.Concurrent.Async as CCA (mapConcurrently) ...@@ -44,10 +48,15 @@ import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import Data.String (String()) import Data.String (String())
import Data.List (lookup)
------------------------------------------------------------------------ ------------------------------------------------------------------------
import Gargantext.Core (Lang(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser) import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------ ------------------------------------------------------------------------
type ParseError = String type ParseError = String
...@@ -61,7 +70,10 @@ type ParseError = String ...@@ -61,7 +70,10 @@ type ParseError = String
-- | According to the format of Input file, -- | According to the format of Input file,
-- different parser are available. -- different parser are available.
data FileFormat = WOS -- Implemented (ISI Format) data FileFormat = WOS
deriving (Show)
-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc -- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc -- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ? -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
...@@ -71,6 +83,57 @@ data FileFormat = WOS -- Implemented (ISI Format) ...@@ -71,6 +83,57 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- TODO: to debug maybe add the filepath in error message -- TODO: to debug maybe add the filepath in error message
-- | Parse file into documents
-- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs format path = do
docs <- snd <$> parse format path
mapM (toDoc format) docs
type Year = Int
type Month = Int
type Day = Int
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
parseDate' l (Just txt) = do
utcTime <- parseDate l txt
let (UTCTime day _) = utcTime
let (y,m,d) = DT.toGregorian day
pure (Just utcTime, (Just (fromIntegral y),Just m,Just d))
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc format d = do
let abstract = lookup "abstract" d
let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
(utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show format)
(lookup "doi" d)
(lookup "URL" d)
Nothing
Nothing
(lookup "title" d)
(lookup "authors" d)
(lookup "source" d)
(lookup "abstract" d)
(fmap (DT.pack . show) utcTime)
(pub_year)
(pub_month)
(pub_day)
Nothing
Nothing
Nothing
(Just $ (DT.pack . show) lang)
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]]) parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do parse format path = do
files <- case takeExtension path of files <- case takeExtension path of
......
...@@ -55,7 +55,7 @@ doc2hyperdataDocument :: Doc -> HyperdataDocument ...@@ -55,7 +55,7 @@ doc2hyperdataDocument :: Doc -> HyperdataDocument
--doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) = --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) = doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
HyperdataDocument (Just "CSV") HyperdataDocument (Just "CSV")
(Just did) (Just . pack . show $ did)
Nothing Nothing
Nothing Nothing
Nothing Nothing
......
...@@ -21,7 +21,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00" ...@@ -21,7 +21,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
import Data.HashMap.Strict as HM hiding (map) import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn) import Data.Text (Text, unpack, splitOn, pack)
import Data.Time (parseTimeOrError, defaultTimeLocale) import Data.Time (parseTimeOrError, defaultTimeLocale)
import Data.Time.Clock (UTCTime, getCurrentTime) import Data.Time.Clock (UTCTime, getCurrentTime)
import Data.Time.LocalTime (utc) import Data.Time.LocalTime (utc)
...@@ -82,15 +82,16 @@ parserLang EN = DC.EN ...@@ -82,15 +82,16 @@ parserLang EN = DC.EN
-- parseDateRaw :: Context -> Text -> SomeErrorHandling Text -- parseDateRaw :: Context -> Text -> SomeErrorHandling Text
-- TODO error handling -- TODO error handling
parseDateRaw :: Lang -> Text -> IO Text parseDateRaw :: Lang -> Text -> IO (Text)
parseDateRaw lang text = do parseDateRaw lang text = do
maybeJson <- map jsonValue <$> parseDateWithDuckling lang text maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
case headMay maybeJson of case headMay maybeJson of
Just (Json.Object object) -> case HM.lookup "value" object of Just (Json.Object object) -> case HM.lookup "value" object of
Just (Json.String date) -> pure date Just (Json.String date) -> pure date
Just _ -> panic "ParseDateRaw ERROR: should be a json String" Just _ -> panic "ParseDateRaw ERROR: should be a json String"
Nothing -> panic "ParseDateRaw ERROR: no date found" Nothing -> panic $ "ParseDateRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
_ -> panic "ParseDateRaw ERROR: type error"
_ -> panic $ "ParseDateRaw ERROR: type error" <> (pack . show) lang <> " " <> text
-- | Current Time in DucklingTime format -- | Current Time in DucklingTime format
......
...@@ -28,10 +28,12 @@ import Data.Attoparsec.ByteString (Parser, try, string ...@@ -28,10 +28,12 @@ import Data.Attoparsec.ByteString (Parser, try, string
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine) import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat) import Data.ByteString (ByteString, concat)
import Data.ByteString.Char8 (pack) import Data.ByteString.Char8 (pack)
import Control.Applicative import Control.Applicative
--import Gargantext.Types -------------------------------------------------------------
-- | wosParser parses ISI format from -- | wosParser parses ISI format from
-- Web Of Science Database -- Web Of Science Database
...@@ -48,7 +50,7 @@ notice = start *> fields <* end ...@@ -48,7 +50,7 @@ notice = start *> fields <* end
where where
start :: Parser ByteString start :: Parser ByteString
start = "\nPT " *> takeTill isEndOfLine start = "\nPT " *> takeTill isEndOfLine
end :: Parser [Char] end :: Parser [Char]
end = manyTill anyChar (string $ pack "\nER\n") end = manyTill anyChar (string $ pack "\nER\n")
...@@ -75,11 +77,12 @@ lines = many line ...@@ -75,11 +77,12 @@ lines = many line
translate :: ByteString -> ByteString translate :: ByteString -> ByteString
translate champs translate champs
| champs == "AU" = "author" | champs == "AF" = "authors"
| champs == "TI" = "title" | champs == "TI" = "title"
| champs == "SO" = "source" | champs == "SO" = "source"
| champs == "DI" = "doi" | champs == "DI" = "doi"
| champs == "PD" = "publication_date" | champs == "PD" = "publication_date"
| champs == "AB" = "abstract" | champs == "AB" = "abstract"
| otherwise = champs | otherwise = champs
-------------------------------------------------------------
...@@ -15,7 +15,7 @@ Main type here is String. ...@@ -15,7 +15,7 @@ Main type here is String.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Stop module Gargantext.Text.Terms.Stop (detectLang, detectLangs, stopList)
where where
import GHC.Base (Functor) import GHC.Base (Functor)
...@@ -31,6 +31,7 @@ import qualified Data.Map.Strict as DM ...@@ -31,6 +31,7 @@ import qualified Data.Map.Strict as DM
import Data.String (String) import Data.String (String)
import Data.Text (Text)
import Data.Text (pack, unpack) import Data.Text (pack, unpack)
import Data.Tuple.Extra (both) import Data.Tuple.Extra (both)
...@@ -90,8 +91,14 @@ data LangWord = LangWord Lang Word ...@@ -90,8 +91,14 @@ data LangWord = LangWord Lang Word
type LangProba = Map Lang Double type LangProba = Map Lang Double
------------------------------------------------------------------------ ------------------------------------------------------------------------
detectLangs :: String -> [(Lang, Double)] detectLang :: Text -> Maybe Lang
detectLangs s = DL.reverse $ DL.sortOn snd detectLang = head . map fst . detectLangs
detectLangs :: Text -> [(Lang, Double)]
detectLangs = detectLangs' . unpack
detectLangs' :: String -> [(Lang, Double)]
detectLangs' s = DL.reverse $ DL.sortOn snd
$ toList $ toList
$ detect (wordsToBook [0..2] s) eventLang $ detect (wordsToBook [0..2] s) eventLang
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment