Commit 1a498118 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] WOS Parser

parent 53e86575
Pipeline #3628 failed with stage
in 53 minutes and 54 seconds
......@@ -3,6 +3,7 @@
* [BACK][FIX] Username and email to lowerCase always. Use migration script please to avoid errors.
* [BACK][FIX][Ngrams Change insert causes Database error (#173)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/173)
* [FRONTED][CLEAN] Removing Isidore DB for now
* [BACK][FIX] WOS Parser
## Version 0.0.6.9.3
......
......@@ -20,14 +20,15 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
where
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad.Trans.Control (MonadBaseControl)
import Control.Monad (join)
import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
......@@ -38,25 +39,24 @@ import Data.String()
import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Prelude
import System.IO.Temp (emptySystemTempFile)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
import qualified Prelude
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
------------------------------------------------------------------------
type ParseError = String
......@@ -168,12 +168,15 @@ parseFormatC _ _ _ = undefined
parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p
parseFile RisPresse Plain p = do
docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
pure $ Right docs
parseFile WOS Plain p = do
docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
pure $ Right docs
parseFile ff _ p = do
docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
pure $ Right docs
......@@ -184,19 +187,19 @@ toDoc ff d = do
-- let abstract = lookup "abstract" d
let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
, _hd_url = lookup "URL" d
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = lookup "title" d
, _hd_authors = Nothing
, _hd_institutes = lookup "authors" d
, _hd_authors = lookup "authors" d
, _hd_institutes = lookup "institutes" d
, _hd_source = lookup "source" d
, _hd_abstract = lookup "abstract" d
, _hd_publication_date = fmap (DT.pack . show) utcTime
......@@ -207,6 +210,8 @@ toDoc ff d = do
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang }
printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
pure hd
enrichWith :: FileType
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
......@@ -267,3 +272,10 @@ clean txt = DBC.map clean' txt
clean' '\t' = ' '
clean' ';' = '.'
clean' c = c
--
splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn Authors (Just "WOS") = (DT.splitOn "; ")
splitOn _ _ = (DT.splitOn ", ")
......@@ -23,7 +23,7 @@ import Data.List (lookup)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, takeTill, take, many1)
import Data.Attoparsec.ByteString.Char8 (isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.ByteString (ByteString, intercalate)
import Gargantext.Prelude hiding (takeWhile, take)
import qualified Data.List as DL
-------------------------------------------------------------
......@@ -55,7 +55,7 @@ fieldWith n = do
let txts' = case DL.length txts > 0 of
True -> txts
False -> []
pure (name, concat ([txt] <> txts'))
pure (name, intercalate ";" ([txt] <> txts'))
lines :: Parser [ByteString]
......@@ -70,5 +70,3 @@ onField :: ByteString -> (ByteString -> [(ByteString, ByteString)])
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
onField k f m = m <> ( maybe [] f (lookup k m) )
......@@ -52,6 +52,7 @@ keys field
| field == "TI" = "title"
| field == "SO" = "source"
| field == "DI" = "doi"
| field == "PY" = "publication_date"
| field == "PD" = "publication_date"
| field == "SP" = "institutes"
| field == "AB" = "abstract"
| otherwise = field
{-|
Module : Gargantext.Core.Text.Ngrams.List.Management
Description : Tools to manage lists
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.List.Management
where
{-
import Data.HashMap.Strict (HashMap)
import Data.Map (Map)
import Gargantext.API.Ngrams
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Database.Action.Flow.Types
import Gargantext.API.Ngrams.Tools (getListNgrams)
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), CorpusId, ListId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList, getClosestParentIdByType)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
restrictListSize
:: forall env err m.
(HasNodeStory env err m, FlowCmdM env err m)
=> CorpusId
-> ListId
-> NgramsType
-> ListType
-> Int -- ^ number of ngram pairs to keep
-> m ()
restrictListSize corpusId listId ngramsType listType size = do
ngrams <- getListNgrams [listId] ngramsType
-- corpus_id <- getClosestParentIdByType
occurrences <- getOccByNgramsOnlyFast' corpusId
listId
ngramsType
(HashMap.keys ngrams)
ngrams' <- filterWith listType size occurrences ngrams
_ <- setListNgrams listId ngramsType ngrams'
return ()
where filterWith :: ListType -> Int -> HashMap NgramsTerm Int
-> HashMap NgramsTerm NgramsRepoElement
-> m (Map NgramsTerm NgramsRepoElement)
filterWith listType' size occs ngrams =
HashMap.filter with ngrams
where
with nre = case (&&) <$> Just (nre^.nre_list == listType)
<*> ( HashMap.lookup (nre^.nre_root) occs
&&
-}
......@@ -75,11 +75,11 @@ import qualified Data.Conduit as C
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
-- import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
......@@ -550,13 +550,14 @@ instance ExtractNgramsT HyperdataDocument
$ _hd_source doc
institutes = map text2ngrams
$ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
$ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
$ _hd_institutes doc
authors = map text2ngrams
$ maybe ["Nothing"] (T.splitOn ", ")
$ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ _hd_authors doc
termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment