Commit 9cfea968 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] parse to hyperdataDocuments (todo: optim).

parent 62c8870d
......@@ -66,7 +66,6 @@ toTree' m n =
TreeN (toNodeTree n) $
m ^.. at (Just $ dt_nodeId n) . _Just . each . to (toTree' m)
------------------------------------------------------------------------
toNodeTree :: DbTreeNode -> NodeTree
toNodeTree (DbTreeNode nId tId _ n) = NodeTree n nodeType nId
......
......@@ -102,7 +102,7 @@ $(deriveJSON (unPrefix "hyperdataDocumentV3_") ''HyperdataDocumentV3)
------------------------------------------------------------------------
data HyperdataDocument = HyperdataDocument { _hyperdataDocument_bdd :: Maybe Text
, _hyperdataDocument_doi :: Maybe Int
, _hyperdataDocument_doi :: Maybe Text
, _hyperdataDocument_url :: Maybe Text
, _hyperdataDocument_uniqId :: Maybe Text
, _hyperdataDocument_page :: Maybe Int
......@@ -113,11 +113,11 @@ data HyperdataDocument = HyperdataDocument { _hyperdataDocument_bdd
, _hyperdataDocument_publication_date :: Maybe Text
, _hyperdataDocument_publication_year :: Maybe Int
, _hyperdataDocument_publication_month :: Maybe Int
, _hyperdataDocument_publication_day :: Maybe Int
, _hyperdataDocument_publication_hour :: Maybe Int
, _hyperdataDocument_publication_minute :: Maybe Int
, _hyperdataDocument_publication_second :: Maybe Int
, _hyperdataDocument_language_iso2 :: Maybe Text
, _hyperdataDocument_language_iso3 :: Maybe Text
, _hyperdataDocument_language_iso2 :: Maybe Text
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_hyperdataDocument_") ''HyperdataDocument)
$(makeLenses ''HyperdataDocument)
......@@ -128,7 +128,7 @@ instance ToField HyperdataDocument where
toHyperdataDocuments :: [(Text, Text)] -> [HyperdataDocument]
toHyperdataDocuments ts = map (\(t1,t2) -> HyperdataDocument Nothing Nothing Nothing Nothing Nothing (Just t1)
Nothing (Just t2) Nothing Nothing Nothing
Nothing Nothing Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing Nothing Nothing
) ts
hyperdataDocuments :: [HyperdataDocument]
......
......@@ -20,13 +20,17 @@ please follow the types.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers (parse, FileFormat(..), clean)
module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
where
import System.FilePath (FilePath(), takeExtension)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Monad (join)
import Data.Time (UTCTime(..))
import qualified Data.Time as DT
import Data.Either.Extra (partitionEithers)
import Data.List (concat)
import qualified Data.Map as DM
......@@ -44,10 +48,15 @@ import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Text.Encoding (decodeUtf8)
import Data.String (String())
import Data.List (lookup)
------------------------------------------------------------------------
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------
type ParseError = String
......@@ -61,7 +70,10 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data FileFormat = WOS -- Implemented (ISI Format)
data FileFormat = WOS
deriving (Show)
-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
......@@ -71,6 +83,57 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- TODO: to debug maybe add the filepath in error message
-- | Parse file into documents
-- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs format path = do
docs <- snd <$> parse format path
mapM (toDoc format) docs
type Year = Int
type Month = Int
type Day = Int
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
parseDate' l (Just txt) = do
utcTime <- parseDate l txt
let (UTCTime day _) = utcTime
let (y,m,d) = DT.toGregorian day
pure (Just utcTime, (Just (fromIntegral y),Just m,Just d))
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc format d = do
let abstract = lookup "abstract" d
let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
(utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show format)
(lookup "doi" d)
(lookup "URL" d)
Nothing
Nothing
(lookup "title" d)
(lookup "authors" d)
(lookup "source" d)
(lookup "abstract" d)
(fmap (DT.pack . show) utcTime)
(pub_year)
(pub_month)
(pub_day)
Nothing
Nothing
Nothing
(Just $ (DT.pack . show) lang)
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
files <- case takeExtension path of
......
......@@ -55,7 +55,7 @@ doc2hyperdataDocument :: Doc -> HyperdataDocument
--doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
HyperdataDocument (Just "CSV")
(Just did)
(Just . pack . show $ did)
Nothing
Nothing
Nothing
......
......@@ -21,7 +21,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn)
import Data.Text (Text, unpack, splitOn, pack)
import Data.Time (parseTimeOrError, defaultTimeLocale)
import Data.Time.Clock (UTCTime, getCurrentTime)
import Data.Time.LocalTime (utc)
......@@ -82,15 +82,16 @@ parserLang EN = DC.EN
-- parseDateRaw :: Context -> Text -> SomeErrorHandling Text
-- TODO error handling
parseDateRaw :: Lang -> Text -> IO Text
parseDateRaw :: Lang -> Text -> IO (Text)
parseDateRaw lang text = do
maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
case headMay maybeJson of
Just (Json.Object object) -> case HM.lookup "value" object of
Just (Json.String date) -> pure date
Just _ -> panic "ParseDateRaw ERROR: should be a json String"
Nothing -> panic "ParseDateRaw ERROR: no date found"
_ -> panic "ParseDateRaw ERROR: type error"
Nothing -> panic $ "ParseDateRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
_ -> panic $ "ParseDateRaw ERROR: type error" <> (pack . show) lang <> " " <> text
-- | Current Time in DucklingTime format
......
......@@ -28,10 +28,12 @@ import Data.Attoparsec.ByteString (Parser, try, string
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.ByteString.Char8 (pack)
import Control.Applicative
--import Gargantext.Types
-------------------------------------------------------------
-- | wosParser parses ISI format from
-- Web Of Science Database
......@@ -48,7 +50,7 @@ notice = start *> fields <* end
where
start :: Parser ByteString
start = "\nPT " *> takeTill isEndOfLine
end :: Parser [Char]
end = manyTill anyChar (string $ pack "\nER\n")
......@@ -75,11 +77,12 @@ lines = many line
translate :: ByteString -> ByteString
translate champs
| champs == "AU" = "author"
| champs == "AF" = "authors"
| champs == "TI" = "title"
| champs == "SO" = "source"
| champs == "DI" = "doi"
| champs == "PD" = "publication_date"
| champs == "AB" = "abstract"
| otherwise = champs
-------------------------------------------------------------
......@@ -15,7 +15,7 @@ Main type here is String.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Stop
module Gargantext.Text.Terms.Stop (detectLang, detectLangs, stopList)
where
import GHC.Base (Functor)
......@@ -31,6 +31,7 @@ import qualified Data.Map.Strict as DM
import Data.String (String)
import Data.Text (Text)
import Data.Text (pack, unpack)
import Data.Tuple.Extra (both)
......@@ -90,8 +91,14 @@ data LangWord = LangWord Lang Word
type LangProba = Map Lang Double
------------------------------------------------------------------------
detectLangs :: String -> [(Lang, Double)]
detectLangs s = DL.reverse $ DL.sortOn snd
detectLang :: Text -> Maybe Lang
detectLang = head . map fst . detectLangs
detectLangs :: Text -> [(Lang, Double)]
detectLangs = detectLangs' . unpack
detectLangs' :: String -> [(Lang, Double)]
detectLangs' s = DL.reverse $ DL.sortOn snd
$ toList
$ detect (wordsToBook [0..2] s) eventLang
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment