Commit 77db4880 authored by Administrator

Merge branch 'alex'

Fix
parents 2bebc82d b85dd6b8
...@@ -17,9 +17,20 @@ class EuropressFileParser(FileParser): ...@@ -17,9 +17,20 @@ class EuropressFileParser(FileParser):
codif = "UTF-8" codif = "UTF-8"
count = 0 count = 0
html_parser = etree.HTMLParser(encoding=codif) if isinstance(file, str):
html = etree.parse(file, html_parser) file = open(file, 'rb')
print(file)
contents = file.read()
print(len(contents))
#return []
encoding = self.detect_encoding(contents)
try:
html_parser = etree.HTMLParser(encoding=encoding)
html = etree.fromstring(contents, html_parser)
html_articles = html.xpath('/html/body/table') html_articles = html.xpath('/html/body/table')
except:
return []
# initialize the list of metadata # initialize the list of metadata
metadata_list = [] metadata_list = []
...@@ -43,7 +54,7 @@ class EuropressFileParser(FileParser): ...@@ -43,7 +54,7 @@ class EuropressFileParser(FileParser):
for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"): for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"):
text = header.text text = header.text
if isinstance(text, bytes): if isinstance(text, bytes):
text = text.decode() text = text.decode(encoding)
format_date_fr = re.compile('\d+\s*\w+\s+\d{4}', re.UNICODE) format_date_fr = re.compile('\d+\s*\w+\s+\d{4}', re.UNICODE)
test_date_fr = format_date_fr.match(text) test_date_fr = format_date_fr.match(text)
......
import collections import collections
import dateutil.parser import dateutil.parser
import zipfile import zipfile
import chardet
from parsing.Caches import LanguagesCache from parsing.Caches import LanguagesCache
...@@ -14,7 +15,8 @@ class FileParser: ...@@ -14,7 +15,8 @@ class FileParser:
def detect_encoding(self, string): def detect_encoding(self, string):
"""Useful method to detect the document encoding. """Useful method to detect the document encoding.
""" """
pass encoding = chardet.detect(string)
return encoding.get('encoding', 'UTF-8')
def format_metadata_dates(self, metadata): def format_metadata_dates(self, metadata):
......
...@@ -379,38 +379,38 @@ buttonAddDataset.click(function() { ...@@ -379,38 +379,38 @@ buttonAddDataset.click(function() {
// $('.tree').jstree({ $('.tree').jstree({
// 'core' : { 'core' : {
// 'data' : { 'data' : {
// 'url' : function(node) { 'url' : function(node) {
// var url = '/api/nodes?' + ((node.id === '#') var url = '/api/nodes?' + ((node.id === '#')
// ? 'type=Project' ? 'type=Project'
// : ('parent=' + node.id) : ('parent=' + node.id)
// ); );
// console.log(url); console.log(url);
// return url; return url;
// }, },
// }, },
// }, },
// "plugins" : ["types"], "plugins" : ["types"],
// "types" : { "types" : {
// "#" : { "#" : {
// "max_children" : 1, "max_children" : 1,
// "max_depth" : 4, "max_depth" : 4,
// "valid_children" : ["root"] "valid_children" : ["root"]
// }, },
// "Project" : { "Project" : {
// "icon" : "http://www.jstree.com/static/3.0.8/assets/images/tree_icon.png", "icon" : "http://www.jstree.com/static/3.0.8/assets/images/tree_icon.png",
// "valid_children" : ["default"] "valid_children" : ["default"]
// }, },
// "Corpus" : { "Corpus" : {
// "valid_children" : ["default","file"] "valid_children" : ["default","file"]
// }, },
// "Document" : { "Document" : {
// "icon" : "glyphicon glyphicon-file", "icon" : "glyphicon glyphicon-file",
// "valid_children" : [] "valid_children" : []
// } }
// }, },
// }); });
// var graph = $('.graph-it').graphIt(640, 480); // var graph = $('.graph-it').graphIt(640, 480);
...@@ -103,7 +103,7 @@ ...@@ -103,7 +103,7 @@
<div class="row"> <div class="row">
<div class="col-md-4"> <div class="col-md-4">
<div class="jumbotron"> <div class="jumbotron">
<h3><a href="/graph-it">1) Documents</a></h3> <h3><a href="/graph-it">Documents</a></h3>
<ol> <ol>
<li>Read</li> <!-- write --> <li>Read</li> <!-- write -->
<li>Count</li> <!-- compute --> <li>Count</li> <!-- compute -->
...@@ -114,7 +114,7 @@ ...@@ -114,7 +114,7 @@
<div class="col-md-4"> <div class="col-md-4">
<div class="jumbotron"> <div class="jumbotron">
<h3><a href="/ngrams">2) Ngrams</a></h3> <h3><a href="/ngrams">Dictionaries</a></h3>
<ol> <ol>
<li>White Lists</li> <li>White Lists</li>
<li>Black Lists</li> <li>Black Lists</li>
...@@ -125,7 +125,7 @@ ...@@ -125,7 +125,7 @@
<div class="col-md-4"> <div class="col-md-4">
<div class="jumbotron"> <div class="jumbotron">
<h3><a href="/graph">3) Visualizations</a></h3> <h3><a href="/graph">Visualizations</a></h3>
<ol> <ol>
<li>Matrix</li> <li>Matrix</li>
<li>Static maps</li> <li>Static maps</li>
......
...@@ -77,10 +77,9 @@ ...@@ -77,10 +77,9 @@
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script> <script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/jquery/jquery-ui.js" %}"></script> <script type="text/javascript" src="{% static "js/jquery/jquery-ui.js" %}"></script>
<!--
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/jstree/3.0.4/themes/default/style.min.css" /> <link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/jstree/3.0.4/themes/default/style.min.css" />
<script src="//cdnjs.cloudflare.com/ajax/libs/jstree/3.0.4/jstree.min.js"></script> <script src="//cdnjs.cloudflare.com/ajax/libs/jstree/3.0.4/jstree.min.js"></script>
-->
<script type="text/javascript" src="{% static "js/charts/dygraph-combined.js" %}"></script> <script type="text/javascript" src="{% static "js/charts/dygraph-combined.js" %}"></script>
<script type="text/javascript" src="{% static "js/graph-it.js" %}"></script> <script type="text/javascript" src="{% static "js/graph-it.js" %}"></script>
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
<div class="jumbotron"> <div class="jumbotron">
<h1>Gargantext</h1> <h1>Gargantext</h1>
<p>A web platform to explore text-mining</p> <p>A web platform to explore text-mining</p>
<a class="btn btn-primary btn-lg" href="/projects">Explore a corpus</a> <a class="btn btn-primary btn-lg" href="/projects">Test Gargantext</a>
</div> </div>
<div class="container"> <div class="container">
......
...@@ -79,7 +79,14 @@ ...@@ -79,7 +79,14 @@
<li> <li>
<a href="/project/{{project.id}}/corpus/{{corpus.id}}">{{corpus.name}}</a> <a href="/project/{{project.id}}/corpus/{{corpus.id}}">{{corpus.name}}</a>
, {{ corpus.count }} Documents , {{ corpus.count }} Documents
<button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom" data-content='<a href="/project/{{ project.id }}/corpus/{{ corpus.id}}/delete">Yes, I am sure!</a>'>Delete</button> <button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
data-content='
<ul>
<li> Add documents </li>
<li> Rename </li>
<li><a href="/project/{{ project.id }}/corpus/{{ corpus.id}}/delete">Delete</a></li>
</ul>
'>Manage</button>
</li> </li>
{% endfor %} {% endfor %}
</ul> </ul>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment