Commit 90d6620e authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] Regex error with Duckling

parent f1538012
#!/bin/bash
#stack install --nix --profile --test --fast --no-install-ghc --skip-ghc-check
stack install --nix --test --no-install-ghc --skip-ghc-check
env LANG=C.UTF-8 stack install --nix --test --no-install-ghc --skip-ghc-check
......@@ -6,4 +6,4 @@ LOGFILE=$FOLDER"/"$FILE
mkdir -p $FOLDER
~/.local/bin/gargantext-server --ini gargantext.ini --run Dev +RTS > $LOGFILE 2>&1 & tail -F $LOGFILE # -p
env LANG=en_US.UTF-8 ~/.local/bin/gargantext-server --ini gargantext.ini --run Dev +RTS > $LOGFILE 2>&1 & tail -F $LOGFILE # -p
......@@ -18,7 +18,6 @@ New corpus means either:
module Gargantext.API.Node.Corpus.New
where
import Conduit
import Control.Lens hiding (elements, Empty)
import Data.Aeson
......@@ -38,8 +37,6 @@ import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal)
......@@ -49,8 +46,6 @@ import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
......@@ -64,8 +59,11 @@ import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
import Gargantext.Prelude
import Gargantext.Prelude.Config (gc_max_docs_parsers)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import qualified Gargantext.Database.GargDB as GargDB
------------------------------------------------------------------------
{-
data Query = Query { query_query :: Text
......@@ -177,6 +175,8 @@ type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: FlowCmdM env err m
=> User
-> CorpusId
......@@ -216,8 +216,11 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
-- TODO if cid is folder -> create Corpus
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
printDebug "[G.A.N.C.New] getDataText with query" q
eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
let lTxts = lefts eTxts
printDebug "[G.A.N.C.New] eTxts" lTxts
case lTxts of
[] -> do
let txts = rights eTxts
......@@ -241,6 +244,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
}
(err:_) -> do
printDebug "Error: " err
pure $ addEvent "ERROR" (T.pack $ show err) $
JobLog { _scst_succeeded = Just 2
, _scst_failed = Just 1
......
......@@ -20,10 +20,12 @@ module Gargantext.Core.Text.Corpus.Parsers.Date
{-(parse, parseRaw, dateSplit, Year, Month, Day)-}
where
import System.Environment (getEnv)
import Data.Aeson (toJSON, Value)
import Data.Either (Either(..))
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn)
import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
import Data.Text (Text, unpack, splitOn, replace)
import Data.Time (defaultTimeLocale, toGregorian, iso8601DateFormat, parseTimeM)
import Data.Time.Clock (UTCTime(..), getCurrentTime)
import Data.Time.LocalTime (utc)
import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
......@@ -33,10 +35,12 @@ import Duckling.Types (Seal(..))
import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
import Gargantext.Core (Lang(FR,EN))
import Gargantext.Core.Types (DebugMode(..), withDebugMode)
import Gargantext.Prelude
import qualified Data.Aeson as Json
import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC
import qualified Control.Exception as CE
import qualified Data.Aeson as Json
import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC
------------------------------------------------------------------------
-- | Parse date to Ints
......@@ -61,31 +65,55 @@ type Day = Int
-- | Date Parser
-- Parses dates mentions in full text given the language.
-- >>> parseDate FR (pack "10 avril 1979 à 19H")
-- 1979-04-10 19:00:00 UTC
-- >>> parseDate EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC
-- >>> parse FR (pack "10 avril 1900 à 19H")
-- 1900-04-10 19:00:00 UTC
-- >>> parse EN (pack "April 10 1900")
-- 1900-04-10 00:00:00 UTC
parse :: Lang -> Text -> IO UTCTime
parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
parse lang s = do
dateStr' <- parseRawSafe lang s
case dateFlow dateStr' of
DateFlowSuccess ok -> pure ok
_ -> withDebugMode (DebugMode True)
"[G.C.T.P.T.Date parse]" (lang,s)
$ getCurrentTime
defaultDate :: Text
defaultDate = "0-0-0T0:0:0"
type DateFormat = Text
type DateDefault = Text
parseDate' :: DateFormat
-> DateDefault
-> Lang
-> Text
-> IO UTCTime
parseDate' format def lang s = do
dateStr' <- parseRaw lang s
if dateStr' == ""
then getCurrentTime
else do
let dateStr = unpack
$ maybe def identity
$ head
$ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
data DateFlow = DucklingSuccess { ds_result :: Text }
| DucklingFailure { df_result :: Text }
| ReadFailure1 { rf1_result :: Text }
| ReadFailure2 { rf2_result :: Text }
| DateFlowSuccess { success :: UTCTime }
| DateFlowFailure
deriving Show
--{-
dateFlow :: DateFlow -> DateFlow
dateFlow (DucklingSuccess res) = case (head $ splitOn "." res) of
Nothing -> dateFlow (ReadFailure1 res)
Just re -> case readDate res of
Nothing -> dateFlow (ReadFailure1 re)
Just ok -> DateFlowSuccess ok
dateFlow (DucklingFailure txt) = case readDate $ replace " " "T" txt of
Nothing -> dateFlow (ReadFailure1 txt)
Just ok -> DateFlowSuccess ok
dateFlow (ReadFailure1 txt) = case readDate txt of
Nothing -> DateFlowFailure
Just ok -> DateFlowSuccess ok
dateFlow _ = DateFlowFailure
--}
readDate :: Text -> Maybe UTCTime
readDate txt = do
let format = cs $ iso8601DateFormat (Just "%H:%M:%S")
parseTimeM True defaultTimeLocale (unpack format) (cs txt)
-- TODO add Paris at Duckling.Locale Region datatype
......@@ -93,8 +121,8 @@ parseDate' format def lang s = do
-- TODO : put this in a more generic place in the source code
parserLang :: Lang -> DC.Lang
parserLang FR = DC.FR
parserLang EN = DC.EN
parserLang _ = panic "not implemented"
parserLang EN = DC.EN
parserLang lang = panic $ "[G.C.T.C.P.Date] Lang not implemented" <> (cs $ show lang)
-- | Final Date parser API
-- IO can be avoided here:
......@@ -102,6 +130,21 @@ parserLang _ = panic "not implemented"
-- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- parseRaw :: Context -> Text -> SomeErrorHandling Text
parseRawSafe :: Lang -> Text -> IO DateFlow
parseRawSafe lang text = do
triedParseRaw <- tryParseRaw lang text
dateStr' <- case triedParseRaw of
Left (CE.SomeException err) -> do
envLang <- getEnv "LANG"
printDebug "[G.C.T.C.P.Date] Exception: " (err, envLang, lang, text)
pure $ DucklingFailure text
Right res -> pure $ DucklingSuccess res
pure dateStr'
tryParseRaw :: CE.Exception e => Lang -> Text -> IO (Either e Text)
tryParseRaw lang text = CE.try (parseRaw lang text)
parseRaw :: Lang -> Text -> IO Text
parseRaw lang text = do -- case result
maybeResult <- extractValue <$> getTimeValue
......@@ -146,5 +189,3 @@ parseDateWithDuckling lang input options = do
-- TODO check/test Options False or True
pure $ analyze input contxt options $ HashSet.fromList [(Seal Time)]
......@@ -16,6 +16,7 @@ commentary with @some markup@.
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Database.Admin.Types.Node
, DebugMode(..), withDebugMode
, Term, Terms(..)
, TokenTag(..), POS(..), NER(..)
, Label, Stems
......@@ -29,6 +30,7 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
import Control.Lens (Prism', (#), makeLenses, over)
import Control.Monad.Except (MonadError(throwError))
import Debug.Trace (trace)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Hashable (Hashable)
......@@ -47,6 +49,14 @@ import Gargantext.Database.Admin.Types.Node
import Gargantext.Prelude
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
------------------------------------------------------------------------
data DebugMode = DebugMode { activated :: Bool }
withDebugMode :: (Show a) => DebugMode -> Text -> a -> b -> b
withDebugMode (DebugMode True ) msg var a = trace (cs $ "DEBUG" <> msg <> (cs $ show var)) a
withDebugMode (DebugMode False) _ _ a = a
------------------------------------------------------------------------
data Ordering = Down | Up
deriving (Enum, Show, Eq, Bounded)
......
......@@ -26,6 +26,7 @@ Portability : POSIX
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
( DataText(..)
, getDataText
, getDataText_Debug
, flowDataText
, flow
......@@ -67,6 +68,8 @@ import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map
import qualified Data.Conduit.List as CL
import qualified Data.Conduit as C
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
......@@ -134,6 +137,13 @@ data DataText = DataOld ![NodeId]
| DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
-- | DataNew ![[HyperdataDocument]]
-- Show instance is not possible because of IO
printDataText :: DataText -> IO ()
printDataText (DataOld xs) = putStrLn $ show xs
printDataText (DataNew (maybeInt, conduitData)) = do
res <- C.runConduit (conduitData .| CL.consume)
putStrLn $ show (maybeInt, res)
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
=> DataOrigin
......@@ -153,6 +163,20 @@ getDataText (InternalOrigin _) _la q _li = do
ids <- map fst <$> searchDocInDatabase cId (stemIt q)
pure $ Right $ DataOld ids
getDataText_Debug :: FlowCmdM env err m
=> DataOrigin
-> TermType Lang
-> API.Query
-> Maybe API.Limit
-> m ()
getDataText_Debug a l q li = do
result <- getDataText a l q li
case result of
Left err -> liftBase $ putStrLn $ show err
Right res -> liftBase $ printDataText res
-------------------------------------------------------------------------------
flowDataText :: forall env err m.
( FlowCmdM env err m
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment