Commit 8b7506c0 authored by Alexandre Delanoë

[PARSERS] refactoring.

parent fa4332db
...@@ -29,7 +29,7 @@ import qualified Gargantext.Text.Parsers.CSV as CSV ...@@ -29,7 +29,7 @@ import qualified Gargantext.Text.Parsers.CSV as CSV
type Query = [S.Term] type Query = [S.Term]
filterDocs :: [DocId] -> Vector CSV.Doc -> Vector CSV.Doc filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds ) filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds )
......
...@@ -57,7 +57,7 @@ import Gargantext.Core.Types ...@@ -57,7 +57,7 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms import Gargantext.Text.Terms
import Gargantext.Text.Context import Gargantext.Text.Context
import Gargantext.Text.Terms.WithList import Gargantext.Text.Terms.WithList
import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year) import Gargantext.Text.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.List.CSV (csvGraphTermList) import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms (terms) import Gargantext.Text.Terms (terms)
import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs) import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs)
...@@ -105,7 +105,7 @@ main = do ...@@ -105,7 +105,7 @@ main = do
. DV.toList . DV.toList
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)])) . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd . snd
<$> readCsv corpusFile <$> readFile corpusFile
-- termListMap :: [Text] -- termListMap :: [Text]
termList <- csvGraphTermList termListFile termList <- csvGraphTermList termListFile
......
...@@ -96,6 +96,7 @@ library: ...@@ -96,6 +96,7 @@ library:
- conduit-extra - conduit-extra
- containers - containers
- contravariant - contravariant
- crawlerPubMed
- data-time-segment - data-time-segment
- directory - directory
- duckling - duckling
......
...@@ -46,7 +46,7 @@ headerCsvGargV3 = header [ "title" ...@@ -46,7 +46,7 @@ headerCsvGargV3 = header [ "title"
, "authors" , "authors"
] ]
--------------------------------------------------------------- ---------------------------------------------------------------
data Doc = Doc data CsvGargV3 = CsvGargV3
{ d_docId :: !Int { d_docId :: !Int
, d_title :: !Text , d_title :: !Text
, d_source :: !Text , d_source :: !Text
...@@ -59,9 +59,8 @@ data Doc = Doc ...@@ -59,9 +59,8 @@ data Doc = Doc
deriving (Show) deriving (Show)
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Doc 2 HyperdataDocument -- | Doc 2 HyperdataDocument
doc2hyperdataDocument :: Doc -> HyperdataDocument toDoc :: CsvGargV3 -> HyperdataDocument
--doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) = toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
HyperdataDocument (Just "CSV") HyperdataDocument (Just "CSV")
(Just . pack . show $ did) (Just . pack . show $ did)
Nothing Nothing
...@@ -82,25 +81,22 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) = ...@@ -82,25 +81,22 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
Nothing Nothing
Nothing Nothing
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Types Conversions -- | Types Conversions
toDocs :: Vector CsvDoc -> [Doc] toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth) $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
-> Doc nId t s py pm pd abst auth ) -> CsvGargV3 nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v'' (V.enumFromN 1 (V.length v'')) v''
where where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3]) seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
--------------------------------------------------------------- ---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs fromDocs docs = V.map fromDocs' docs
where where
fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth) fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Split a document in its context -- | Split a document in its context
...@@ -201,25 +197,25 @@ delimiter = fromIntegral $ ord '\t' ...@@ -201,25 +197,25 @@ delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text] readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields) readCsvOn fields fp = V.toList
<$> snd <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> readFile fp <$> snd
<$> readFile fp
------------------------------------------------------------------------ ------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a) readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy f = fmap (readByteStringLazy f) . BL.readFile readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
readFileStrict :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a) readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict f = fmap (readByteStringStrict f) . BS.readFile readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy :: (FromNamedRecord a) => a -> BL.ByteString -> (Header, Vector a) readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
readByteStringLazy f bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e) Left e -> panic (pack e)
Right csvDocs -> csvDocs Right csvDocs -> csvDocs
readByteStringStrict :: (FromNamedRecord a) => a -> BS.ByteString -> (Header, Vector a) readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -227,6 +223,7 @@ readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict ...@@ -227,6 +223,7 @@ readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
readFile :: FilePath -> IO (Header, Vector CsvDoc) readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile = fmap readCsvLazyBS . BL.readFile readFile = fmap readCsvLazyBS . BL.readFile
-- | TODO use readByteStringLazy -- | TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc) readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
......
...@@ -35,7 +35,7 @@ import Gargantext.Text.Parsers.CSV ...@@ -35,7 +35,7 @@ import Gargantext.Text.Parsers.CSV
type DocId = Int type DocId = Int
type DocSearchEngine = SearchEngine type DocSearchEngine = SearchEngine
Doc CsvGargV3
DocId DocId
DocField DocField
NoFeatures NoFeatures
...@@ -48,7 +48,7 @@ initialDocSearchEngine :: DocSearchEngine ...@@ -48,7 +48,7 @@ initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine = initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig = docSearchConfig =
SearchConfig { SearchConfig {
documentKey = d_docId, documentKey = d_docId,
...@@ -57,7 +57,7 @@ docSearchConfig = ...@@ -57,7 +57,7 @@ docSearchConfig =
documentFeatureValue = const noFeatures documentFeatureValue = const noFeatures
} }
where where
extractTerms :: Doc -> DocField -> [Text] extractTerms :: CsvGargV3 -> DocField -> [Text]
extractTerms doc TitleField = monoTexts (d_title doc) extractTerms doc TitleField = monoTexts (d_title doc)
extractTerms doc AbstractField = monoTexts (d_abstract doc) extractTerms doc AbstractField = monoTexts (d_abstract doc)
......
...@@ -25,6 +25,8 @@ extra-deps: ...@@ -25,6 +25,8 @@ extra-deps:
commit: 3fe28b683aba5ddf05e3b5f8eced0bd05c5a29f9 commit: 3fe28b683aba5ddf05e3b5f8eced0bd05c5a29f9
- git: https://github.com/robstewart57/rdf4h.git - git: https://github.com/robstewart57/rdf4h.git
commit: 4fd2edf30c141600ffad6d730cc4c1c08a6dbce4 commit: 4fd2edf30c141600ffad6d730cc4c1c08a6dbce4
- git: https://gitlab.iscpif.fr/gargantext/crawlers/pubmed
commit: dcaa0f5dd53f20648f4f5a615d29163582a4219c
#- opaleye-0.6.7002.0 #- opaleye-0.6.7002.0
- KMP-0.1.0.2 - KMP-0.1.0.2
- accelerate-1.2.0.0 - accelerate-1.2.0.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment