Commit 5a2df841 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DB][Flow] question about the map of ngramsT.

parent 2d561cb0
...@@ -24,9 +24,11 @@ authors ...@@ -24,9 +24,11 @@ authors
module Gargantext.Database.Flow (flowDatabase) module Gargantext.Database.Flow (flowDatabase)
where where
import GHC.Show (Show)
import System.FilePath (FilePath) import System.FilePath (FilePath)
import Data.Maybe (Maybe(..), catMaybes) import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (Text) import Data.Text (Text, splitOn)
import Data.Map (Map) import Data.Map (Map)
import Data.Tuple.Extra (both, second) import Data.Tuple.Extra (both, second)
import qualified Data.Map as DM import qualified Data.Map as DM
...@@ -43,6 +45,7 @@ import Gargantext.Database.Types.Node (HyperdataDocument(..)) ...@@ -43,6 +45,7 @@ import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.User (getUser, UserLight(..), Username) import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Parsers (parseDocs, FileFormat) import Gargantext.Text.Parsers (parseDocs, FileFormat)
import Gargantext.Ext.IMT (toSchoolName)
type UserId = Int type UserId = Int
type RootId = Int type RootId = Int
...@@ -56,16 +59,34 @@ flowDatabase ff fp cName = do ...@@ -56,16 +59,34 @@ flowDatabase ff fp cName = do
-- Documents Flow -- Documents Flow
hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
--printDebug "hyperdataDocuments" hyperdataDocuments
ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
printDebug "Docs IDs : " (length ids) --printDebug "Docs IDs : " (ids)
idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
printDebug "Repeated Docs IDs : " (length ids) --printDebug "Repeated Docs IDs : " (length ids)
-- Ngrams Flow -- Ngrams Flow
-- todo: flow for new documents only
let tids = toInserted ids
--printDebug "toInserted ids" (length tids, tids)
let tihs = toInsert hyperdataDocuments
--printDebug "toInsert hyperdataDocuments" (length tihs, tihs)
let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments) let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
-- printDebug "documentsWithId" documentsWithId
let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
printDebug "docsWithNgrams" docsWithNgrams
{-
let maps = mapNodeIdNgrams docsWithNgrams let maps = mapNodeIdNgrams docsWithNgrams
printDebug "maps" (maps)
indexedNgrams <- runCmd' $ indexNgrams maps indexedNgrams <- runCmd' $ indexNgrams maps
printDebug "inserted ngrams" indexedNgrams
_ <- runCmd' $ insertToNodeNgrams indexedNgrams _ <- runCmd' $ insertToNodeNgrams indexedNgrams
-- List Flow -- List Flow
...@@ -74,10 +95,12 @@ flowDatabase ff fp cName = do ...@@ -74,10 +95,12 @@ flowDatabase ff fp cName = do
printDebug "Docs IDs : " (length idsRepeat) printDebug "Docs IDs : " (length idsRepeat)
(_, _, corpusId2) <- subFlow "alexandre" cName -}
(_, _, corpusId2) <- subFlow "user1" cName
{-
inserted <- runCmd' $ add corpusId2 (map reId ids) inserted <- runCmd' $ add corpusId2 (map reId ids)
printDebug "Inserted : " (length inserted) printDebug "Inserted : " (length inserted)
-}
pure [corpusId2, corpusId] pure [corpusId2, corpusId]
--runCmd' $ del [corpusId2, corpusId] --runCmd' $ del [corpusId2, corpusId]
...@@ -116,7 +139,7 @@ type NodeId = Int ...@@ -116,7 +139,7 @@ type NodeId = Int
type ListId = Int type ListId = Int
toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d)) toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
where where
hash = maybe "Error" identity hash = maybe "Error" identity
...@@ -127,12 +150,12 @@ toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) ) ...@@ -127,12 +150,12 @@ toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
data DocumentWithId = data DocumentWithId =
DocumentWithId { documentId :: NodeId DocumentWithId { documentId :: NodeId
, documentData :: HyperdataDocument , documentData :: HyperdataDocument
} } deriving (Show)
mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId] mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs mergeData rs hs = map (\(hash,hpd) -> DocumentWithId (lookup' hash rs) hpd) $ DM.toList hs
where where
lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs) lookup' h xs = maybe (panic $ "Database.Flow.mergeData: Error with " <> h) reId (DM.lookup h rs)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -140,16 +163,20 @@ data DocumentIdWithNgrams = ...@@ -140,16 +163,20 @@ data DocumentIdWithNgrams =
DocumentIdWithNgrams DocumentIdWithNgrams
{ documentWithId :: DocumentWithId { documentWithId :: DocumentWithId
, document_ngrams :: Map (NgramsT Ngrams) Int , document_ngrams :: Map (NgramsT Ngrams) Int
} } deriving (Show)
-- TODO add Authors and Terms (Title + Abstract) -- TODO add Terms (Title + Abstract)
-- add f :: Text -> Text -- add f :: Text -> Text
-- newtype Ngrams = Ngrams Text -- newtype Ngrams = Ngrams Text
extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
extractNgramsT doc = DM.fromList $ [(NgramsT Sources ngrams, 1)] extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
<> [(NgramsT Institutes i' , 1)| i' <- institutes ]
<> [(NgramsT Authors a' , 1)| a' <- authors ]
where where
ngrams = text2ngrams $ maybe "Nothing" identity maybeNgrams source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
maybeNgrams = _hyperdataDocument_source doc institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
-- TODO group terms
documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int) documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
-> [DocumentWithId] -> [DocumentIdWithNgrams] -> [DocumentWithId] -> [DocumentIdWithNgrams]
......
...@@ -76,13 +76,14 @@ import qualified Database.PostgreSQL.Simple as DPS ...@@ -76,13 +76,14 @@ import qualified Database.PostgreSQL.Simple as DPS
-- ngrams in source field of document has Sources Type -- ngrams in source field of document has Sources Type
-- ngrams in authors field of document has Authors Type -- ngrams in authors field of document has Authors Type
-- ngrams in text (title or abstract) of documents has Terms Type -- ngrams in text (title or abstract) of documents has Terms Type
data NgramsType = Sources | Authors | Terms data NgramsType = Authors | Institutes | Sources | Terms
deriving (Eq) deriving (Eq, Show)
ngramsTypeId :: NgramsType -> Int ngramsTypeId :: NgramsType -> Int
ngramsTypeId Terms = 1 ngramsTypeId Authors = 1
ngramsTypeId Authors = 2 ngramsTypeId Institutes = 2
ngramsTypeId Sources = 3 ngramsTypeId Sources = 3
ngramsTypeId Terms = 4
type NgramsTerms = Text type NgramsTerms = Text
type NgramsId = Int type NgramsId = Int
...@@ -92,7 +93,7 @@ type Size = Int ...@@ -92,7 +93,7 @@ type Size = Int
-- | TODO put it in Gargantext.Text.Ngrams -- | TODO put it in Gargantext.Text.Ngrams
data Ngrams = Ngrams { _ngramsTerms :: Text data Ngrams = Ngrams { _ngramsTerms :: Text
, _ngramsSize :: Int , _ngramsSize :: Int
} deriving (Generic) } deriving (Generic, Show)
instance Eq Ngrams where instance Eq Ngrams where
(==) = (==) (==) = (==)
instance Ord Ngrams where instance Ord Ngrams where
...@@ -110,7 +111,7 @@ text2ngrams txt = Ngrams txt $ length $ splitOn " " txt ...@@ -110,7 +111,7 @@ text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
data NgramsT a = data NgramsT a =
NgramsT { _ngramsType :: NgramsType NgramsT { _ngramsType :: NgramsType
, _ngramsT :: a , _ngramsT :: a
} deriving (Generic) } deriving (Generic, Show)
instance Eq (NgramsT a) instance Eq (NgramsT a)
where (==) = (==) where (==) = (==)
...@@ -127,7 +128,7 @@ data NgramsIndexed = ...@@ -127,7 +128,7 @@ data NgramsIndexed =
NgramsIndexed NgramsIndexed
{ _ngrams :: Ngrams { _ngrams :: Ngrams
, _ngramsId :: NgramsId , _ngramsId :: NgramsId
} deriving (Generic) } deriving (Show, Generic)
instance Eq NgramsIndexed where instance Eq NgramsIndexed where
(==) = (==) (==) = (==)
......
...@@ -94,6 +94,10 @@ hal_data = snd <$> CSV.readHal "doc/corpus_imt/Gargantext_Corpus.csv" ...@@ -94,6 +94,10 @@ hal_data = snd <$> CSV.readHal "doc/corpus_imt/Gargantext_Corpus.csv"
names :: S.Set Text names :: S.Set Text
names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools
toSchoolName :: Text -> Text
toSchoolName t = case M.lookup t mapIdSchool of
Nothing -> t
Just t' -> t'
publisBySchool :: DV.Vector CsvHal -> [(Maybe Text, Int)] publisBySchool :: DV.Vector CsvHal -> [(Maybe Text, Int)]
publisBySchool hal_data' = Gargantext.Prelude.map (\(i,n) -> (M.lookup i mapIdSchool, n)) publisBySchool hal_data' = Gargantext.Prelude.map (\(i,n) -> (M.lookup i mapIdSchool, n))
......
...@@ -294,8 +294,8 @@ csvHal2doc (CsvHal title source ...@@ -294,8 +294,8 @@ csvHal2doc (CsvHal title source
Nothing Nothing
Nothing Nothing
(Just title) (Just title)
(Just inst)
(Just authors) (Just authors)
(Just inst)
(Just source) (Just source)
(Just abstract) (Just abstract)
(Just $ pack . show $ jour pub_year pub_month pub_day) (Just $ pack . show $ jour pub_year pub_month pub_day)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment