Commit 1a498118 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] WOS Parser

parent 53e86575
Pipeline #3628 failed with stage
in 53 minutes and 54 seconds
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
* [BACK][FIX] Username and email to lowerCase always. Use migration script please to avoid errors. * [BACK][FIX] Username and email to lowerCase always. Use migration script please to avoid errors.
* [BACK][FIX][Ngrams Change insert causes Database error (#173)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/173) * [BACK][FIX][Ngrams Change insert causes Database error (#173)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/173)
* [FRONTED][CLEAN] Removing Isidore DB for now * [FRONTED][CLEAN] Removing Isidore DB for now
* [BACK][FIX] WOS Parser
## Version 0.0.6.9.3 ## Version 0.0.6.9.3
......
...@@ -20,14 +20,15 @@ please follow the types. ...@@ -20,14 +20,15 @@ please follow the types.
{-# LANGUAGE PackageImports #-} {-# LANGUAGE PackageImports #-}
module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC) module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
where where
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries) import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently) import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad.Trans.Control (MonadBaseControl)
import Control.Monad (join) import Control.Monad (join)
import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Attoparsec.ByteString (parseOnly, Parser) import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..)) import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers) import Data.Either.Extra (partitionEithers)
...@@ -38,25 +39,24 @@ import Data.String() ...@@ -38,25 +39,24 @@ import Data.String()
import Data.Text (Text, intercalate, pack, unpack) import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second) import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import System.FilePath (FilePath(), takeExtension) import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
import qualified Data.ByteString as DB import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.Text as DT import qualified Data.Text as DT
import qualified Prelude
import System.IO.Temp (emptySystemTempFile)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
import qualified Prelude
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
------------------------------------------------------------------------ ------------------------------------------------------------------------
type ParseError = String type ParseError = String
...@@ -168,12 +168,15 @@ parseFormatC _ _ _ = undefined ...@@ -168,12 +168,15 @@ parseFormatC _ _ _ = undefined
parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument]) parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal Plain p = parseHal p parseFile CsvHal Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p parseFile CsvGargV3 Plain p = parseCsv p
parseFile RisPresse Plain p = do parseFile RisPresse Plain p = do
docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
pure $ Right docs pure $ Right docs
parseFile WOS Plain p = do parseFile WOS Plain p = do
docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
pure $ Right docs pure $ Right docs
parseFile ff _ p = do parseFile ff _ p = do
docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
pure $ Right docs pure $ Right docs
...@@ -184,19 +187,19 @@ toDoc ff d = do ...@@ -184,19 +187,19 @@ toDoc ff d = do
-- let abstract = lookup "abstract" d -- let abstract = lookup "abstract" d
let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract)) let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d , _hd_doi = lookup "doi" d
, _hd_url = lookup "URL" d , _hd_url = lookup "URL" d
, _hd_uniqId = Nothing , _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing , _hd_uniqIdBdd = Nothing
, _hd_page = Nothing , _hd_page = Nothing
, _hd_title = lookup "title" d , _hd_title = lookup "title" d
, _hd_authors = Nothing , _hd_authors = lookup "authors" d
, _hd_institutes = lookup "authors" d , _hd_institutes = lookup "institutes" d
, _hd_source = lookup "source" d , _hd_source = lookup "source" d
, _hd_abstract = lookup "abstract" d , _hd_abstract = lookup "abstract" d
, _hd_publication_date = fmap (DT.pack . show) utcTime , _hd_publication_date = fmap (DT.pack . show) utcTime
...@@ -207,6 +210,8 @@ toDoc ff d = do ...@@ -207,6 +210,8 @@ toDoc ff d = do
, _hd_publication_minute = Nothing , _hd_publication_minute = Nothing
, _hd_publication_second = Nothing , _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang } , _hd_language_iso2 = Just $ (DT.pack . show) lang }
printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
pure hd
enrichWith :: FileType enrichWith :: FileType
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]]) -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
...@@ -267,3 +272,10 @@ clean txt = DBC.map clean' txt ...@@ -267,3 +272,10 @@ clean txt = DBC.map clean' txt
clean' '\t' = ' ' clean' '\t' = ' '
clean' ';' = '.' clean' ';' = '.'
clean' c = c clean' c = c
--
splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn Authors (Just "WOS") = (DT.splitOn "; ")
splitOn _ _ = (DT.splitOn ", ")
...@@ -23,7 +23,7 @@ import Data.List (lookup) ...@@ -23,7 +23,7 @@ import Data.List (lookup)
import Control.Applicative import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, takeTill, take, many1) import Data.Attoparsec.ByteString (Parser, try, takeTill, take, many1)
import Data.Attoparsec.ByteString.Char8 (isEndOfLine) import Data.Attoparsec.ByteString.Char8 (isEndOfLine)
import Data.ByteString (ByteString, concat) import Data.ByteString (ByteString, intercalate)
import Gargantext.Prelude hiding (takeWhile, take) import Gargantext.Prelude hiding (takeWhile, take)
import qualified Data.List as DL import qualified Data.List as DL
------------------------------------------------------------- -------------------------------------------------------------
...@@ -55,7 +55,7 @@ fieldWith n = do ...@@ -55,7 +55,7 @@ fieldWith n = do
let txts' = case DL.length txts > 0 of let txts' = case DL.length txts > 0 of
True -> txts True -> txts
False -> [] False -> []
pure (name, concat ([txt] <> txts')) pure (name, intercalate ";" ([txt] <> txts'))
lines :: Parser [ByteString] lines :: Parser [ByteString]
...@@ -70,5 +70,3 @@ onField :: ByteString -> (ByteString -> [(ByteString, ByteString)]) ...@@ -70,5 +70,3 @@ onField :: ByteString -> (ByteString -> [(ByteString, ByteString)])
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)] -> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
onField k f m = m <> ( maybe [] f (lookup k m) ) onField k f m = m <> ( maybe [] f (lookup k m) )
...@@ -52,6 +52,7 @@ keys field ...@@ -52,6 +52,7 @@ keys field
| field == "TI" = "title" | field == "TI" = "title"
| field == "SO" = "source" | field == "SO" = "source"
| field == "DI" = "doi" | field == "DI" = "doi"
| field == "PY" = "publication_date" | field == "PD" = "publication_date"
| field == "SP" = "institutes"
| field == "AB" = "abstract" | field == "AB" = "abstract"
| otherwise = field | otherwise = field
{-|
Module : Gargantext.Core.Text.Ngrams.List.Management
Description : Tools to manage lists
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.List.Management
where
{-
import Data.HashMap.Strict (HashMap)
import Data.Map (Map)
import Gargantext.API.Ngrams
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Database.Action.Flow.Types
import Gargantext.API.Ngrams.Tools (getListNgrams)
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), CorpusId, ListId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList, getClosestParentIdByType)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
restrictListSize
:: forall env err m.
(HasNodeStory env err m, FlowCmdM env err m)
=> CorpusId
-> ListId
-> NgramsType
-> ListType
-> Int -- ^ number of ngram pairs to keep
-> m ()
restrictListSize corpusId listId ngramsType listType size = do
ngrams <- getListNgrams [listId] ngramsType
-- corpus_id <- getClosestParentIdByType
occurrences <- getOccByNgramsOnlyFast' corpusId
listId
ngramsType
(HashMap.keys ngrams)
ngrams' <- filterWith listType size occurrences ngrams
_ <- setListNgrams listId ngramsType ngrams'
return ()
where filterWith :: ListType -> Int -> HashMap NgramsTerm Int
-> HashMap NgramsTerm NgramsRepoElement
-> m (Map NgramsTerm NgramsRepoElement)
filterWith listType' size occs ngrams =
HashMap.filter with ngrams
where
with nre = case (&&) <$> Just (nre^.nre_list == listType)
<*> ( HashMap.lookup (nre^.nre_root) occs
&&
-}
...@@ -75,11 +75,11 @@ import qualified Data.Conduit as C ...@@ -75,11 +75,11 @@ import qualified Data.Conduit as C
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..)) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..)) import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName) -- import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire) import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types import Gargantext.Core.Flow.Types
import Gargantext.Core.Text import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
import Gargantext.Core.Text.List (buildNgramsLists) import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..)) import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..)) import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
...@@ -550,13 +550,14 @@ instance ExtractNgramsT HyperdataDocument ...@@ -550,13 +550,14 @@ instance ExtractNgramsT HyperdataDocument
$ _hd_source doc $ _hd_source doc
institutes = map text2ngrams institutes = map text2ngrams
$ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", ")) $ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
$ _hd_institutes doc $ _hd_institutes doc
authors = map text2ngrams authors = map text2ngrams
$ maybe ["Nothing"] (T.splitOn ", ") $ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ _hd_authors doc $ _hd_authors doc
termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt)) termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
<$> concat <$> concat
<$> liftBase (extractTerms lang' $ hasText doc) <$> liftBase (extractTerms lang' $ hasText doc)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment