Commit 8b7506c0 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[PARSERS] refactoring.

parent fa4332db
......@@ -29,7 +29,7 @@ import qualified Gargantext.Text.Parsers.CSV as CSV
type Query = [S.Term]
filterDocs :: [DocId] -> Vector CSV.Doc -> Vector CSV.Doc
filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds )
......
......@@ -57,7 +57,7 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms
import Gargantext.Text.Context
import Gargantext.Text.Terms.WithList
import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms (terms)
import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs)
......@@ -105,7 +105,7 @@ main = do
. DV.toList
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd
<$> readCsv corpusFile
<$> readFile corpusFile
-- termListMap :: [Text]
termList <- csvGraphTermList termListFile
......
......@@ -96,6 +96,7 @@ library:
- conduit-extra
- containers
- contravariant
- crawlerPubMed
- data-time-segment
- directory
- duckling
......
......@@ -46,7 +46,7 @@ headerCsvGargV3 = header [ "title"
, "authors"
]
---------------------------------------------------------------
data Doc = Doc
data CsvGargV3 = CsvGargV3
{ d_docId :: !Int
, d_title :: !Text
, d_source :: !Text
......@@ -59,9 +59,8 @@ data Doc = Doc
deriving (Show)
---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
doc2hyperdataDocument :: Doc -> HyperdataDocument
--doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
toDoc :: CsvGargV3 -> HyperdataDocument
toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
HyperdataDocument (Just "CSV")
(Just . pack . show $ did)
Nothing
......@@ -82,25 +81,22 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
Nothing
Nothing
---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector CsvDoc -> [Doc]
toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
-> Doc nId t s py pm pd abst auth )
-> CsvGargV3 nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v''
where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
where
fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
---------------------------------------------------------------
-- | Split a document in its context
......@@ -201,25 +197,25 @@ delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readFile fp
readCsvOn fields fp = V.toList
<$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readFile fp
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
readFileStrict :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a) => a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy f bs = case decodeByNameWith csvDecodeOptions bs of
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
readByteStringStrict :: (FromNamedRecord a) => a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
------------------------------------------------------------------------
......@@ -227,6 +223,7 @@ readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile = fmap readCsvLazyBS . BL.readFile
-- | TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
......
......@@ -35,7 +35,7 @@ import Gargantext.Text.Parsers.CSV
type DocId = Int
type DocSearchEngine = SearchEngine
Doc
CsvGargV3
DocId
DocField
NoFeatures
......@@ -48,7 +48,7 @@ initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
......@@ -57,7 +57,7 @@ docSearchConfig =
documentFeatureValue = const noFeatures
}
where
extractTerms :: Doc -> DocField -> [Text]
extractTerms :: CsvGargV3 -> DocField -> [Text]
extractTerms doc TitleField = monoTexts (d_title doc)
extractTerms doc AbstractField = monoTexts (d_abstract doc)
......
......@@ -25,6 +25,8 @@ extra-deps:
commit: 3fe28b683aba5ddf05e3b5f8eced0bd05c5a29f9
- git: https://github.com/robstewart57/rdf4h.git
commit: 4fd2edf30c141600ffad6d730cc4c1c08a6dbce4
- git: https://gitlab.iscpif.fr/gargantext/crawlers/pubmed
commit: dcaa0f5dd53f20648f4f5a615d29163582a4219c
#- opaleye-0.6.7002.0
- KMP-0.1.0.2
- accelerate-1.2.0.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment