Commit 00dc93a0 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TextFlow REFACT] addin ExtractedNgrams type for simple or enriched ngrams extraction

parent 8f0fcd75
Pipeline #1333 failed with stage
......@@ -12,11 +12,12 @@ Portability : POSIX
module Gargantext.Core
where
import Gargantext.Prelude
import GHC.Generics (Generic)
import Data.Aeson
import Data.Either(Either(Left))
import Data.Hashable (Hashable)
import Data.Swagger
import GHC.Generics (Generic)
import Gargantext.Prelude
import Servant.API
------------------------------------------------------------------------
......@@ -47,6 +48,8 @@ instance FromHttpApiData Lang
parseUrlPiece "FR" = pure FR
parseUrlPiece "All" = pure All
parseUrlPiece _ = Left "Unexpected value of OrderBy"
instance Hashable Lang
allLangs :: [Lang]
allLangs = [minBound ..]
......@@ -64,10 +67,11 @@ instance HasDBid Lang where
fromDBid 2 = EN
fromDBid _ = panic "HasDBid lang, not implemented"
------------------------------------------------------------------------
data PostTagAlgo = CoreNLP
deriving (Show, Read)
deriving (Show, Read, Eq, Ord, Generic)
instance Hashable PostTagAlgo
instance HasDBid PostTagAlgo where
toDBid CoreNLP = 1
......
......@@ -36,6 +36,7 @@ module Gargantext.Core.Text.Terms
import Control.Lens
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.Map (Map)
import Data.Text (Text)
import Data.Traversable
......@@ -45,7 +46,6 @@ import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import Gargantext.Core
import Gargantext.Core.Flow.Types
......@@ -57,9 +57,11 @@ import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams)
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgramsPostag)
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams, NgramsId)
import Gargantext.Prelude
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
data TermType lang
= Mono { _tt_lang :: !lang }
......@@ -111,26 +113,44 @@ withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
withLang l _ = l
------------------------------------------------------------------------
data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
| EnrichedNgrams { unEnrichedNgrams :: NgramsPostag }
deriving (Eq, Ord, Generic)
instance Hashable ExtractedNgrams
class ExtractNgramsT h
where
extractNgramsT :: HasText h
=> TermType Lang
-> h
-> Cmd err (HashMap Ngrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
------------------------------------------------------------------------
cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
cleanExtractedNgrams s (SimpleNgrams ng)
| Text.length (ng ^. ngramsTerms) < s = SimpleNgrams ng
| otherwise = SimpleNgrams $ text2ngrams (Text.take s (ng ^. ngramsTerms))
cleanExtractedNgrams s _ = undefined
extracted2ngrams :: ExtractedNgrams -> Ngrams
extracted2ngrams (SimpleNgrams ng) = ng
extracted2ngrams _ = undefined
filterNgrams :: Int -> HashMap Ngrams (Map NgramsType Int)
-> HashMap Ngrams (Map NgramsType Int)
filterNgrams s = HashMap.mapKeys filter
where
filter ng
| Text.length (ng ^. ngramsTerms) < s = ng
| otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))
isSimpleNgrams :: ExtractedNgrams -> Bool
isSimpleNgrams (SimpleNgrams _) = True
isSimpleNgrams _ = False
insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
insertExtractedNgrams ngs = do
let (s, e) = List.partition isSimpleNgrams ngs
m1 <- insertNgrams (map unSimpleNgrams s)
m2 <- insertNgramsPostag (map unEnrichedNgrams e)
pure $ m1 <> m2
-- =======================================================
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
......@@ -147,15 +167,6 @@ terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (
------------------------------------------------------------------------
text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;." :: String)
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
......@@ -175,6 +186,8 @@ termsUnsupervised (Unsupervised l n s m) =
. uniText
termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)
......@@ -185,3 +198,12 @@ uniText = map (List.filter (not . isPunctuation))
. sentences -- TODO get sentences according to lang
. Text.toLower
text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;." :: String)
......@@ -25,25 +25,19 @@ Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
module Gargantext.Core.Text.Terms.Multi.PosTagging
where
import GHC.Generics
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Aeson.TH (deriveJSON)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Set (fromList)
import Data.String.Conversions (ConvertibleStrings)
import Data.Text (Text, splitOn, pack, toLower)
import GHC.Generics
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import Network.HTTP.Simple
import Data.String.Conversions (ConvertibleStrings)
------------------------------------------------------------------------
------------------------------------------------------------------------
data Token = Token { _tokenIndex :: Int
......
......@@ -31,6 +31,7 @@ import Control.Lens (Prism', (#), makeLenses, over)
import Control.Monad.Except (MonadError(throwError))
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Hashable (Hashable)
import Data.Maybe
import Data.Monoid
import Data.Semigroup
......@@ -40,11 +41,11 @@ import Data.Swagger (ToSchema(..))
import Data.Text (Text, unpack)
import Data.Validity
import GHC.Generics
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
import Gargantext.Core.Types.Main
import Gargantext.Database.Admin.Types.Node
import Gargantext.Core.Utils.Prefix (unPrefix, wellNamedSchema)
import Gargantext.Database.Admin.Types.Node
import Gargantext.Prelude
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
------------------------------------------------------------------------
data Ordering = Down | Up
......@@ -74,7 +75,7 @@ data POS = NP
| JJ | VB
| CC | IN | DT
| NoPos
deriving (Show, Generic, Eq)
deriving (Show, Generic, Eq, Ord)
------------------------------------------------------------------------
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
......@@ -94,10 +95,11 @@ instance FromJSON POS where
pos "IN" = IN
pos "DT" = DT
-- French specific
pos "P" = IN
pos "P" = IN
pos _ = NoPos
instance ToJSON POS
instance Hashable POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
deriving (Show, Generic)
......
......@@ -263,13 +263,15 @@ insertMasterDocs c lang hs = do
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs :: HashMap Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
<- mapNodeIdNgrams
<$> documentIdWithNgrams
(extractNgramsT $ withLang lang documentsWithId)
documentsWithId
terms2id <- insertNgrams $ HashMap.keys mapNgramsDocs
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- to be removed
let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
......@@ -372,28 +374,28 @@ mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
where
extractNgramsT l hc = filterNgrams 255 <$> extract l hc
extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where
extract :: TermType Lang -> HyperdataContact
-> Cmd err (HashMap Ngrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
extract _l hc' = do
let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc'
pure $ HashMap.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT :: TermType Lang
-> HyperdataDocument
-> Cmd err (HashMap Ngrams (Map NgramsType Int))
extractNgramsT lang hd = filterNgrams 255 <$> extractNgramsT' lang hd
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
where
extractNgramsT' :: TermType Lang
-> HyperdataDocument
-> Cmd err (HashMap Ngrams (Map NgramsType Int))
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
extractNgramsT' lang' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
......@@ -412,10 +414,10 @@ instance ExtractNgramsT HyperdataDocument
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
pure $ HashMap.fromList $ [(source, Map.singleton Sources 1)]
<> [(i', Map.singleton Institutes 1) | i' <- institutes ]
<> [(a', Map.singleton Authors 1) | a' <- authors ]
<> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
pure $ HashMap.fromList $ [(SimpleNgrams source, Map.singleton Sources 1)]
<> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
<> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
<> [(SimpleNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
where
......
......@@ -22,10 +22,12 @@ module Gargantext.Database.Query.Table.Ngrams
where
import Control.Lens ((^.))
import Data.HashMap.Strict (HashMap)
import Data.ByteString.Internal (ByteString)
import Data.Map (Map, fromList)
import Data.Text (Text)
import qualified Database.PostgreSQL.Simple as PGS
import qualified Data.HashMap.Strict as HashMap
import Gargantext.Core.Types
import Gargantext.Database.Prelude (runOpaQuery, Cmd)
......@@ -64,8 +66,8 @@ _dbGetNgramsDb = runOpaQuery queryNgramsTable
-- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
insertNgrams :: [Ngrams] -> Cmd err (Map Text NgramsId)
insertNgrams ns = fromList <$> map (\(Indexed i t) -> (t, i)) <$> (insertNgrams' ns)
insertNgrams :: [Ngrams] -> Cmd err (HashMap Text NgramsId)
insertNgrams ns = HashMap.fromList <$> map (\(Indexed i t) -> (t, i)) <$> (insertNgrams' ns)
-- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
insertNgrams' :: [Ngrams] -> Cmd err [Indexed Int Text]
......
......@@ -16,16 +16,30 @@ Portability : POSIX
module Gargantext.Database.Query.Table.NgramsPostag
where
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.Text (Text)
import GHC.Generics (Generic)
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Database.Prelude (Cmd, runPGSQuery)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Prelude
import Gargantext.Database.Types
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Database.PostgreSQL.Simple as PGS
data NgramsPostag = NgramsPostag { _np_lang :: Lang
, _np_algo :: PostTagAlgo
, _np_postag :: POS
, _np_form :: Ngrams
, _np_lem :: Ngrams
}
deriving (Eq, Ord, Generic)
instance Hashable NgramsPostag
type NgramsPostagInsert = ( Int
, Int
......@@ -36,9 +50,24 @@ type NgramsPostagInsert = ( Int
, Int
)
insertNgramsPostag :: [NgramsPostagInsert] -> Cmd err [Indexed Int Text]
insertNgramsPostag ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns)
toInsert :: NgramsPostag -> NgramsPostagInsert
toInsert (NgramsPostag l a p form lem) =
( toDBid l
, toDBid a
, cs $ show p
, _ngramsTerms form
, _ngramsSize form
, _ngramsTerms lem
, _ngramsSize lem
)
insertNgramsPostag :: [NgramsPostag] -> Cmd err (HashMap Text NgramsId)
insertNgramsPostag ns = HashMap.fromList
<$> map (\(Indexed t i) -> (t,i))
<$> insertNgramsPostag' (map toInsert ns)
insertNgramsPostag' :: [NgramsPostagInsert] -> Cmd err [Indexed Text Int]
insertNgramsPostag' ns = runPGSQuery queryInsertNgramsPostag (PGS.Only $ Values fields ns)
where
fields = map (\t -> QualifiedIdentifier Nothing t) $ snd fields_name
......
......@@ -19,6 +19,7 @@ Ngrams connection to the Database.
module Gargantext.Database.Schema.Ngrams
where
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Codec.Serialise (Serialise())
import Control.Lens (over)
......@@ -35,6 +36,7 @@ import Text.Read (read)
import Gargantext.Database.Types
import Gargantext.Database.Schema.Prelude
import qualified Database.PostgreSQL.Simple as PGS
import qualified Data.HashMap.Strict as HashMap
type NgramsId = Int
......@@ -178,19 +180,19 @@ instance Functor NgramsT where
fmap = over ngramsT
-----------------------------------------------------------------------
withMap :: Map Text NgramsId -> Text -> NgramsId
withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
withMap :: HashMap Text NgramsId -> Text -> NgramsId
withMap m n = maybe (panic "withMap: should not happen") identity (HashMap.lookup n m)
indexNgramsT :: Map Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
indexNgramsT = fmap . indexNgramsWith . withMap
-- | TODO replace NgramsT whith Typed NgramsType Ngrams
indexTypedNgrams :: Map Text NgramsId
indexTypedNgrams :: HashMap Text NgramsId
-> Typed NgramsType Ngrams
-> Typed NgramsType (Indexed Int Ngrams)
indexTypedNgrams = fmap . indexNgramsWith . withMap
indexNgrams :: Map Text NgramsId -> Ngrams -> Indexed Int Ngrams
indexNgrams :: HashMap Text NgramsId -> Ngrams -> Indexed Int Ngrams
indexNgrams = indexNgramsWith . withMap
indexNgramsWith :: (Text -> NgramsId) -> Ngrams -> Indexed Int Ngrams
......
......@@ -49,7 +49,7 @@ data PosTag = PosTag { unPosTag :: Text }
| NER { unNER :: Text } -- TODO
------------------------------------------------------------------------
type NgramsPostag = NgramsPostagPoly (Maybe Int) Lang PostTagAlgo (Maybe PosTag) NgramsTerm NgramsTerm (Maybe Int)
-- type NgramsPostag = NgramsPostagPoly (Maybe Int) Lang PostTagAlgo (Maybe PosTag) NgramsTerm NgramsTerm (Maybe Int)
type NgramsPostagDB = NgramsPostagPoly (Maybe Int) Int Int (Maybe Text) Int Int Int
------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment