Commit 28fb139f authored by Atrax Nicolas

Add PDFtoTXT and TXTtoTSV tools

parent 7eaf9990
import sys
import os
from datetime import date

from lib.tika.tika import parser
from lib.pdfminer.pdfminer.pdfparser import PDFParser
from lib.pdfminer.pdfminer.pdfdocument import PDFDocument


def replaceNewlines(txt):
    """Drop whitespace-only lines and rejoin the remaining lines."""
    res = ""
    for line in txt.split("\n"):
        if line.replace(" ", "") == "":
            continue
        if res != "":
            res += "\n"
        res += line
    return res


def reSplit(new, old):
    """Re-split the filtered text `new` along the original line boundaries in `old`."""
    res = []
    count = 0
    for s in old:
        tmp = ""
        for c in s:
            # Guard against running past the end of the filtered text.
            if count < len(new) and c == new[count]:
                tmp += c
                count += 1
        if tmp != "":
            res.append(tmp)
    return res


def removeFiligrane(abstract, filigrane):
    """Remove every occurrence of the watermark (filigrane) from the extracted text."""
    if filigrane == "":
        return abstract
    tmp = abstract.split("\n")
    txt = "".join(tmp).replace(filigrane, "")
    # reSplit returns a list of lines; join them so callers always get a string.
    return "\n".join(reSplit(txt, tmp))


def getTextFromPDF(fileAddress, filigrane):
    """Extract the text of a PDF with Tika and strip the watermark, if any."""
    parsed_pdf = parser.from_file(fileAddress)
    data = parsed_pdf['content'] or ""
    data = replaceNewlines(data)
    return removeFiligrane(data, filigrane)


def getDate(fileAddress):
    """Return (year, month, day) from the PDF's CreationDate metadata.

    Falls back to January 1st of the current year when no date is available.
    """
    year = str(date.today().year)
    month = "1"
    day = "1"
    with open(fileAddress, 'rb') as fp:
        pdf_parser = PDFParser(fp)
        doc = PDFDocument(pdf_parser)
        info = doc.info[0] if doc.info else {}
        if "CreationDate" in info:
            # PDF dates look like b"D:YYYYMMDDHHmmSS...".
            tmp = info["CreationDate"].decode('utf-8')
            year = tmp[2:6]
            month = tmp[6:8]
            day = tmp[8:10]
    return year, month, day


def waitAnswer(question):
    """Ask a yes/no question until the user answers y or n (case-insensitive)."""
    answer = input(question + " (Y/N) : ")
    while answer.lower() not in ("y", "n"):
        answer = input("Wrong input! Answer with either y or n: ")
    return answer.lower() == "y"


def getFiligrane(fileName):
    """Ask whether the file has a watermark and, if so, which text to remove."""
    filigrane = ""
    question = "Does the file \"" + fileName + "\" have a watermark (filigrane)?"
    if waitAnswer(question):
        filigrane = input("Please enter the watermark text: ")
    return filigrane


def PDFtoTXT(fileAddress):
    """Convert pdf/<name>.pdf to txt/<name>.txt, prefixed with the creation date."""
    fileName = fileAddress.replace("pdf/", "")
    year, month, day = getDate(fileAddress)
    filigrane = getFiligrane(fileName)
    txt = "#Date from PDF# : " + month + " " + day + " " + year + "\n\n"
    txt += getTextFromPDF(fileAddress, filigrane)
    with open("txt/" + fileName.replace(".pdf", ".txt"), "w", encoding='utf-8-sig') as file:
        file.write(txt)
    print("Your file has been converted successfully.")


def main():
    # Expects exactly one argument: the name of a PDF stored in the pdf/ directory.
    if len(sys.argv) == 2:
        fileName = sys.argv[1]
        PDFtoTXT("pdf/" + fileName)
    return


if __name__ == '__main__':
    sys.exit(main())
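# Usage sketch: the filename of this script is not shown in the diff, so
# "PDFtoTXT.py" below is only an assumed name. With a document placed at
# pdf/report.pdf (and an existing txt/ directory), running
#
#     python PDFtoTXT.py report.pdf
#
# answers the watermark prompt interactively and writes the extracted text,
# prefixed with the PDF's creation date, to txt/report.txt.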
MIT License
Copyright (c) 2004-2019 Yusuke Shinyama
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
include Makefile
include LICENSE
include *.txt
include *.py
include *.md
graft cmaprsrc
graft docs
graft pdfminer
graft samples
graft tools
## Makefile (for maintenance purpose)
##

PACKAGE=pdfminer

PYTHON=python -B
TWINE=twine
RM=rm -f
CP=cp -f
MKDIR=mkdir

all:

install:
	$(PYTHON) setup.py install --home=$(HOME)

clean:
	-$(PYTHON) setup.py clean
	-$(RM) -r build dist MANIFEST pdfminer.egg-info
	-cd $(PACKAGE) && $(MAKE) clean
	-cd tools && $(MAKE) clean
	-cd samples && $(MAKE) clean

distclean: clean cmap_clean

sdist: distclean MANIFEST.in
	$(PYTHON) setup.py sdist

upload: sdist
	$(TWINE) check dist/*.tar.gz
	$(TWINE) upload dist/*.tar.gz

WEBDIR=../github.io/$(PACKAGE)

publish:
	$(CP) docs/*.html docs/*.png docs/*.css $(WEBDIR)

CONV_CMAP=env PYTHONPATH=. $(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap

cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.marshal.gz $(CMAPDST)/to-unicode-Adobe-GB1.marshal.gz \
	$(CMAPDST)/to-unicode-Adobe-Japan1.marshal.gz $(CMAPDST)/to-unicode-Adobe-Korea1.marshal.gz

cmap_clean:
	-$(RM) -r $(CMAPDST)

$(CMAPDST):
	$(MKDIR) $(CMAPDST)

$(CMAPDST)/to-unicode-Adobe-CNS1.marshal.gz: $(CMAPDST)
	$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
		$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt

$(CMAPDST)/to-unicode-Adobe-GB1.marshal.gz: $(CMAPDST)
	$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
		$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt

$(CMAPDST)/to-unicode-Adobe-Japan1.marshal.gz: $(CMAPDST)
	$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
		$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt

$(CMAPDST)/to-unicode-Adobe-Korea1.marshal.gz: $(CMAPDST)
	$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
		$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt

test: cmap
	$(PYTHON) -m pdfminer.arcfour
	$(PYTHON) -m pdfminer.ascii85
	$(PYTHON) -m pdfminer.lzw
	$(PYTHON) -m pdfminer.rijndael
	$(PYTHON) -m pdfminer.runlength
	$(PYTHON) -m pdfminer.ccitt
	$(PYTHON) -m pdfminer.psparser
	cd samples && $(MAKE) test
Metadata-Version: 2.1
Name: pdfminer
Version: 20191125
Summary: PDF parser and analyzer
Home-page: http://github.com/euske/pdfminer
Author: Yusuke Shinyama
Author-email: yusuke@shinyama.jp
License: MIT
Description: # PDFMiner
PDFMiner is a text extraction tool for PDF documents.
[![Build Status](https://travis-ci.org/euske/pdfminer.svg?branch=master)](https://travis-ci.org/euske/pdfminer)
[![PyPI](https://img.shields.io/pypi/v/pdfminer)](https://pypi.org/project/pdfminer/)
**Warning**: Starting from version 20191010, PDFMiner supports **Python 3 only**.
For Python 2 support, check out
<a href="https://github.com/pdfminer/pdfminer.six">pdfminer.six</a>.
## Features:
* Pure Python (3.6 or above).
* Supports PDF-1.7. (well, almost)
* Obtains the exact location of text as well as other layout information (fonts, etc.).
* Performs automatic layout analysis.
* Can convert PDF into other formats (HTML/XML).
* Can extract an outline (TOC).
* Can extract tagged contents.
* Supports basic encryption (RC4 and AES).
* Supports various font types (Type1, TrueType, Type3, and CID).
* Supports CJK languages and vertical writing scripts.
* Has an extensible PDF parser that can be used for other purposes.
## How to Use:
1. `> pip install pdfminer`
1. `> pdf2txt.py samples/simple1.pdf`
## Command Line Syntax:
### pdf2txt.py
pdf2txt.py extracts all the texts that are rendered programmatically.
It also extracts the corresponding locations, font names, font sizes,
writing direction (horizontal or vertical) for each text segment. It
does not recognize text in images. A password needs to be provided for
restricted PDF documents.
> pdf2txt.py [-P password] [-o output] [-t text|html|xml|tag]
[-O output_dir] [-c encoding] [-s scale] [-R rotation]
[-Y normal|loose|exact] [-p pagenos] [-m maxpages]
[-S] [-C] [-n] [-A] [-V]
[-M char_margin] [-L line_margin] [-W word_margin]
[-F boxes_flow] [-d]
input.pdf ...
* `-P password` : PDF password.
* `-o output` : Output file name.
* `-t text|html|xml|tag` : Output type. (default: automatically inferred from the output file name.)
* `-O output_dir` : Output directory for extracted images.
* `-c encoding` : Output encoding. (default: utf-8)
* `-s scale` : Output scale.
* `-R rotation` : Rotates the page by the given number of degrees.
* `-Y normal|loose|exact` : Specifies the layout mode. (only for HTML output.)
* `-p pagenos` : Processes certain pages only.
* `-m maxpages` : Limits the maximum number of pages to process.
* `-S` : Strips control characters.
* `-C` : Disables resource caching.
* `-n` : Disables layout analysis.
* `-A` : Applies layout analysis for all texts including figures.
* `-V` : Automatically detects vertical writing.
* `-M char_margin` : Specifies the char margin.
* `-W word_margin` : Specifies the word margin.
* `-L line_margin` : Specifies the line margin.
* `-F boxes_flow` : Specifies the box flow ratio.
* `-d` : Turns on Debug output.
### dumppdf.py
dumppdf.py is used for debugging PDFs.
It dumps all the internal contents in pseudo-XML format.
> dumppdf.py [-P password] [-a] [-p pageid] [-i objid]
[-o output] [-r|-b|-t] [-T] [-O directory] [-d]
input.pdf ...
* `-P password` : PDF password.
* `-a` : Extracts all objects.
* `-p pageid` : Extracts a Page object.
* `-i objid` : Extracts a certain object.
* `-o output` : Output file name.
* `-r` : Raw mode. Dumps the raw compressed/encoded streams.
* `-b` : Binary mode. Dumps the uncompressed/decoded streams.
* `-t` : Text mode. Dumps the streams in text format.
* `-T` : Tagged mode. Dumps the tagged contents.
* `-O output_dir` : Output directory for extracted streams.
## TODO
* Replace STRICT variable with something better.
* Improve the debugging functions.
* Use logging module instead of sys.stderr.
* Proper test cases.
* PEP-8 and PEP-257 conformance.
* Better documentation.
* Crypto stream filter support.
## Related Projects
* <a href="http://pybrary.net/pyPdf/">pyPdf</a>
* <a href="http://www.foolabs.com/xpdf/">xpdf</a>
* <a href="http://pdfbox.apache.org/">pdfbox</a>
* <a href="http://mupdf.com/">mupdf</a>
Keywords: pdf parser,pdf converter,layout analysis,text mining
Platform: UNKNOWN
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Topic :: Text Processing
Requires-Python: >=3.6
Description-Content-Type: text/markdown
# PDFMiner
PDFMiner is a text extraction tool for PDF documents.
[![Build Status](https://travis-ci.org/euske/pdfminer.svg?branch=master)](https://travis-ci.org/euske/pdfminer)
[![PyPI](https://img.shields.io/pypi/v/pdfminer)](https://pypi.org/project/pdfminer/)
**Warning**: Starting from version 20191010, PDFMiner supports **Python 3 only**.
For Python 2 support, check out
<a href="https://github.com/pdfminer/pdfminer.six">pdfminer.six</a>.
## Features:
* Pure Python (3.6 or above).
* Supports PDF-1.7. (well, almost)
* Obtains the exact location of text as well as other layout information (fonts, etc.).
* Performs automatic layout analysis.
* Can convert PDF into other formats (HTML/XML).
* Can extract an outline (TOC).
* Can extract tagged contents.
* Supports basic encryption (RC4 and AES).
* Supports various font types (Type1, TrueType, Type3, and CID).
* Supports CJK languages and vertical writing scripts.
* Has an extensible PDF parser that can be used for other purposes.
## How to Use:
1. `> pip install pdfminer`
1. `> pdf2txt.py samples/simple1.pdf`
## Command Line Syntax:
### pdf2txt.py
pdf2txt.py extracts all the texts that are rendered programmatically.
It also extracts the corresponding locations, font names, font sizes,
writing direction (horizontal or vertical) for each text segment. It
does not recognize text in images. A password needs to be provided for
restricted PDF documents.
> pdf2txt.py [-P password] [-o output] [-t text|html|xml|tag]
[-O output_dir] [-c encoding] [-s scale] [-R rotation]
[-Y normal|loose|exact] [-p pagenos] [-m maxpages]
[-S] [-C] [-n] [-A] [-V]
[-M char_margin] [-L line_margin] [-W word_margin]
[-F boxes_flow] [-d]
input.pdf ...
* `-P password` : PDF password.
* `-o output` : Output file name.
* `-t text|html|xml|tag` : Output type. (default: automatically inferred from the output file name.)
* `-O output_dir` : Output directory for extracted images.
* `-c encoding` : Output encoding. (default: utf-8)
* `-s scale` : Output scale.
* `-R rotation` : Rotates the page by the given number of degrees.
* `-Y normal|loose|exact` : Specifies the layout mode. (only for HTML output.)
* `-p pagenos` : Processes certain pages only.
* `-m maxpages` : Limits the maximum number of pages to process.
* `-S` : Strips control characters.
* `-C` : Disables resource caching.
* `-n` : Disables layout analysis.
* `-A` : Applies layout analysis for all texts including figures.
* `-V` : Automatically detects vertical writing.
* `-M char_margin` : Specifies the char margin.
* `-W word_margin` : Specifies the word margin.
* `-L line_margin` : Specifies the line margin.
* `-F boxes_flow` : Specifies the box flow ratio.
* `-d` : Turns on Debug output.
### dumppdf.py
dumppdf.py is used for debugging PDFs.
It dumps all the internal contents in pseudo-XML format.
> dumppdf.py [-P password] [-a] [-p pageid] [-i objid]
[-o output] [-r|-b|-t] [-T] [-O directory] [-d]
input.pdf ...
* `-P password` : PDF password.
* `-a` : Extracts all objects.
* `-p pageid` : Extracts a Page object.
* `-i objid` : Extracts a certain object.
* `-o output` : Output file name.
* `-r` : Raw mode. Dumps the raw compressed/encoded streams.
* `-b` : Binary mode. Dumps the uncompressed/decoded streams.
* `-t` : Text mode. Dumps the streams in text format.
* `-T` : Tagged mode. Dumps the tagged contents.
* `-O output_dir` : Output directory for extracted streams.
## TODO
* Replace STRICT variable with something better.
* Improve the debugging functions.
* Use logging module instead of sys.stderr.
* Proper test cases.
* PEP-8 and PEP-257 conformance.
* Better documentation.
* Crypto stream filter support.
## Related Projects
* <a href="http://pybrary.net/pyPdf/">pyPdf</a>
* <a href="http://www.foolabs.com/xpdf/">xpdf</a>
* <a href="http://pdfbox.apache.org/">pdfbox</a>
* <a href="http://mupdf.com/">mupdf</a>
README.txt for cmaprsrc
This directory contains Adobe CMap resources. CMaps are required
to decode text data written in CJK (Chinese, Japanese, Korean) languages.
CMap resources are now freely available from Adobe's web site:
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
The following files were extracted from the downloadable tarballs:
cid2code_Adobe_CNS1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z
cid2code_Adobe_GB1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z
cid2code_Adobe_Japan1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z
cid2code_Adobe_Korea1.txt:
http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z
Here is the license information in the original files:
%%Copyright: -----------------------------------------------------------
%%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
%%Copyright: All rights reserved.
%%Copyright:
%%Copyright: Redistribution and use in source and binary forms, with or
%%Copyright: without modification, are permitted provided that the
%%Copyright: following conditions are met:
%%Copyright:
%%Copyright: Redistributions of source code must retain the above
%%Copyright: copyright notice, this list of conditions and the following
%%Copyright: disclaimer.
%%Copyright:
%%Copyright: Redistributions in binary form must reproduce the above
%%Copyright: copyright notice, this list of conditions and the following
%%Copyright: disclaimer in the documentation and/or other materials
%%Copyright: provided with the distribution.
%%Copyright:
%%Copyright: Neither the name of Adobe Systems Incorporated nor the names
%%Copyright: of its contributors may be used to endorse or promote
%%Copyright: products derived from this software without specific prior
%%Copyright: written permission.
%%Copyright:
%%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
%%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
%%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
%%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
%%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
%%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
%%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
%%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
%%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
%%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
%%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
%%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
%%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%%Copyright: -----------------------------------------------------------
[TGIF drawing source for a figure illustrating CJK CID mapping: U+30FC (the Japanese long-vowel sign) maps to Adobe-Japan1 CID:660 in horizontal writing and CID:7891 in vertical writing, while U+5199 (the letter "sha") maps to Adobe-Japan1 CID:2296 as Japanese and to Adobe-GB1 CID:3967 as Chinese.]
[TGIF drawing source for Figure 1 of the programming guide (objrel.png), showing the relationships between PDFMiner classes: a PDF file is read by PDFParser, which exchanges "request objects" / "store objects" with PDFDocument; PDFDocument hands page contents to PDFInterpreter, which uses PDFResourceManager and sends rendering instructions to PDFDevice; PDFDevice writes to a display, file, etc.]
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link rel="stylesheet" type="text/css" href="style.css">
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<title>Programming with PDFMiner</title>
</head>
<body>
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Mon Mar 24 11:49:28 UTC 2014
<!-- hhmts end -->
</div>
<p>
<a href="index.html">[Back to PDFMiner homepage]</a>
<h1>Programming with PDFMiner</h1>
<p>
This page explains how to use PDFMiner as a library
from other applications.
<ul>
<li> <a href="#overview">Overview</a>
<li> <a href="#basic">Basic Usage</a>
<li> <a href="#layout">Performing Layout Analysis</a>
<li> <a href="#tocextract">Obtaining Table of Contents</a>
<li> <a href="#extend">Extending Functionality</a>
</ul>
<h2><a name="overview">Overview</a></h2>
<p>
<strong>PDF is evil.</strong> Although it is called a PDF
"document", it's nothing like a Word or HTML document. PDF is more
like a graphic representation. PDF contents are just a bunch of
instructions that tell how to place the stuff at each exact
position on a display or paper. In most cases, it has no logical
structure such as sentences or paragraphs and it cannot adapt
itself when the paper size changes. PDFMiner attempts to
reconstruct some of those structures by guessing from their
positioning, but nothing is guaranteed to work. Ugly, I
know. Again, PDF is evil.
<p>
[More technical details about the internal structure of PDF:
"How to Extract Text Contents from PDF Manually"
<a href="http://www.youtube.com/watch?v=k34wRxaxA_c">(part 1)</a>
<a href="http://www.youtube.com/watch?v=_A1M4OdNsiQ">(part 2)</a>
<a href="http://www.youtube.com/watch?v=sfV_7cWPgZE">(part 3)</a>]
<p>
Because a PDF file has such a big and complex structure,
parsing a PDF file as a whole is time and memory consuming. However,
not every part is needed for most PDF processing tasks. Therefore
PDFMiner takes a strategy of lazy parsing, which is to parse the
stuff only when it's necessary. To parse PDF files, you need to use at
least two classes: <code>PDFParser</code> and <code>PDFDocument</code>.
These two objects are associated with each other.
<code>PDFParser</code> fetches data from a file,
and <code>PDFDocument</code> stores it. You'll also need
<code>PDFPageInterpreter</code> to process the page contents
and <code>PDFDevice</code> to translate it to whatever you need.
<code>PDFResourceManager</code> is used to store
shared resources such as fonts or images.
<p>
Figure 1 shows the relationship between the classes in PDFMiner.
<div align=center>
<img src="objrel.png"><br>
<small>Figure 1. Relationships between PDFMiner classes</small>
</div>
<h2><a name="basic">Basic Usage</a></h2>
<p>
A typical way to parse a PDF file is the following:
<blockquote><pre>
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
<span class="comment"># Open a PDF file.</span>
fp = open('mypdf.pdf', 'rb')
<span class="comment"># Create a PDF parser object associated with the file object.</span>
parser = PDFParser(fp)
<span class="comment"># Create a PDF document object that stores the document structure.</span>
<span class="comment"># Supply the password for initialization.</span>
document = PDFDocument(parser, password)
<span class="comment"># Check if the document allows text extraction. If not, abort.</span>
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
<span class="comment"># Create a PDF resource manager object that stores shared resources.</span>
rsrcmgr = PDFResourceManager()
<span class="comment"># Create a PDF device object.</span>
device = PDFDevice(rsrcmgr)
<span class="comment"># Create a PDF interpreter object.</span>
interpreter = PDFPageInterpreter(rsrcmgr, device)
<span class="comment"># Process each page contained in the document.</span>
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
</pre></blockquote>
<h2><a name="layout">Performing Layout Analysis</a></h2>
<p>
Here is a typical way to use the layout analysis function:
<blockquote><pre>
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
<span class="comment"># Set parameters for analysis.</span>
laparams = LAParams()
<span class="comment"># Create a PDF page aggregator object.</span>
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    <span class="comment"># receive the LTPage object for the page.</span>
    layout = device.get_result()
</pre></blockquote>
A layout analyzer returns a <code>LTPage</code> object for each page
in the PDF document. This object contains child objects within the page,
forming a tree structure. Figure 2 shows the relationship between
these objects.
<div align=center>
<img src="layout.png"><br>
<small>Figure 2. Layout objects and their tree structure</small>
</div>
<dl>
<dt> <code>LTPage</code>
<dd> Represents an entire page. May contain child objects like
<code>LTTextBox</code>, <code>LTFigure</code>, <code>LTImage</code>, <code>LTRect</code>,
<code>LTCurve</code> and <code>LTLine</code>.
<dt> <code>LTTextBox</code>
<dd> Represents a group of text chunks that can be contained in a rectangular area.
Note that this box is created by geometric analysis and does not necessarily
represent a logical boundary of the text.
It contains a list of <code>LTTextLine</code> objects.
<code>get_text()</code> method returns the text content.
<dt> <code>LTTextLine</code>
<dd> Contains a list of <code>LTChar</code> objects that represent
a single text line. The characters are aligned either horizontally
or vertically, depending on the text's writing mode.
<code>get_text()</code> method returns the text content.
<dt> <code>LTChar</code>
<dt> <code>LTAnno</code>
<dd> Represent an actual letter in the text as a Unicode string.
Note that, while a <code>LTChar</code> object has actual boundaries,
<code>LTAnno</code> objects do not, as these are "virtual" characters,
inserted by a layout analyzer according to the relationship between two characters
(e.g. a space).
<dt> <code>LTFigure</code>
<dd> Represents an area used by PDF Form objects. PDF Forms can be used to
present figures or pictures by embedding yet another PDF document within a page.
Note that <code>LTFigure</code> objects can appear recursively.
<dt> <code>LTImage</code>
<dd> Represents an image object. Embedded images can be
in JPEG or other formats, but currently PDFMiner does not
pay much attention to graphical objects.
<dt> <code>LTLine</code>
<dd> Represents a single straight line.
Could be used for separating text or figures.
<dt> <code>LTRect</code>
<dd> Represents a rectangle.
Could be used for framing other pictures or figures.
<dt> <code>LTCurve</code>
<dd> Represents a generic Bezier curve.
</dl>
<p>
Also, check out <a href="http://denis.papathanasiou.org/?p=343">a more complete example by Denis Papathanasiou</a>.
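<p>
A minimal sketch of walking this tree, assuming <code>layout</code> is the
<code>LTPage</code> obtained from <code>device.get_result()</code> above
(the helper <code>show_layout</code> is only illustrative):
<blockquote><pre>
from pdfminer.layout import LTTextBox, LTTextLine, LTFigure

def show_layout(obj, depth=0):
    <span class="comment"># Print the class of each layout object, plus the text of text containers.</span>
    print('  '*depth + obj.__class__.__name__)
    if isinstance(obj, (LTTextBox, LTTextLine)):
        print('  '*depth + obj.get_text())
    elif isinstance(obj, LTFigure):
        <span class="comment"># LTFigure objects can appear recursively.</span>
        for child in obj:
            show_layout(child, depth+1)

for element in layout:
    show_layout(element)
</pre></blockquote>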
<h2><a name="tocextract">Obtaining Table of Contents</a></h2>
<p>
PDFMiner provides functions to access the document's table of contents
("Outlines").
<blockquote><pre>
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
<span class="comment"># Open a PDF document.</span>
fp = open('mypdf.pdf', 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser, password)
<span class="comment"># Get the outlines of the document.</span>
outlines = document.get_outlines()
for (level, title, dest, a, se) in outlines:
    print(level, title)
</pre></blockquote>
<p>
Some PDF documents use page numbers as destinations, while others
use page numbers and the physical location within the page. Since
PDF does not have a logical structure, and it does not provide a
way to refer to any in-page object from the outside, there's no
way to tell exactly which part of text these destinations are
referring to.
<h2><a name="extend">Extending Functionality</a></h2>
<p>
You can extend the <code>PDFPageInterpreter</code> and <code>PDFDevice</code> classes
to process page contents differently or to obtain other information.
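<p>
For example, here is a small sketch of a custom device that counts pages by
overriding the <code>begin_page</code> hook called by the interpreter
(see <code>pdfminer/pdfdevice.py</code> for the full set of hooks and their exact
signatures in your copy; the class name below is made up for the example):
<blockquote><pre>
from pdfminer.pdfdevice import PDFDevice

class PageCountingDevice(PDFDevice):
    <span class="comment"># Counts the pages the interpreter hands to the device.</span>
    def __init__(self, rsrcmgr):
        PDFDevice.__init__(self, rsrcmgr)
        self.npages = 0
    def begin_page(self, page, ctm):
        self.npages += 1

device = PageCountingDevice(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
print(device.npages, 'pages processed')
</pre></blockquote>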
<hr noshade>
<address>Yusuke Shinyama</address>
</body>
blockquote { background: #eeeeee; }
h1 { border-bottom: solid black 2px; }
h2 { border-bottom: solid black 1px; }
.comment { color: darkgreen; }
LICENSE
MANIFEST.in
Makefile
README.md
setup.py
cmaprsrc/README.txt
cmaprsrc/cid2code_Adobe_CNS1.txt
cmaprsrc/cid2code_Adobe_GB1.txt
cmaprsrc/cid2code_Adobe_Japan1.txt
cmaprsrc/cid2code_Adobe_Korea1.txt
docs/cid.obj
docs/cid.png
docs/index.html
docs/layout.obj
docs/layout.png
docs/objrel.obj
docs/objrel.png
docs/programming.html
docs/style.css
pdfminer/Makefile
pdfminer/__init__.py
pdfminer/arcfour.py
pdfminer/ascii85.py
pdfminer/ccitt.py
pdfminer/cmapdb.py
pdfminer/converter.py
pdfminer/encodingdb.py
pdfminer/fontmetrics.py
pdfminer/glyphlist.py
pdfminer/image.py
pdfminer/latin_enc.py
pdfminer/layout.py
pdfminer/lzw.py
pdfminer/pdfcolor.py
pdfminer/pdfdevice.py
pdfminer/pdfdocument.py
pdfminer/pdffont.py
pdfminer/pdfinterp.py
pdfminer/pdfpage.py
pdfminer/pdfparser.py
pdfminer/pdftypes.py
pdfminer/psparser.py
pdfminer/rijndael.py
pdfminer/runlength.py
pdfminer/utils.py
pdfminer.egg-info/PKG-INFO
pdfminer.egg-info/SOURCES.txt
pdfminer.egg-info/dependency_links.txt
pdfminer.egg-info/requires.txt
pdfminer.egg-info/top_level.txt
samples/Makefile
samples/README
samples/jo.html.ref
samples/jo.pdf
samples/jo.tex
samples/jo.txt.ref
samples/jo.xml.ref
samples/simple1.html.ref
samples/simple1.pdf
samples/simple1.txt.ref
samples/simple1.xml.ref
samples/simple2.html.ref
samples/simple2.pdf
samples/simple2.txt.ref
samples/simple2.xml.ref
samples/simple3.html.ref
samples/simple3.pdf
samples/simple3.txt.ref
samples/simple3.xml.ref
samples/encryption/Makefile
samples/encryption/aes-128-m.pdf
samples/encryption/aes-128-m.xml
samples/encryption/aes-128.pdf
samples/encryption/aes-128.xml
samples/encryption/aes-256-m.pdf
samples/encryption/aes-256-m.xml
samples/encryption/aes-256.pdf
samples/encryption/aes-256.xml
samples/encryption/base.pdf
samples/encryption/base.xml
samples/encryption/rc4-128.pdf
samples/encryption/rc4-128.xml
samples/encryption/rc4-40.pdf
samples/encryption/rc4-40.xml
samples/nonfree/dmca.html.ref
samples/nonfree/dmca.pdf
samples/nonfree/dmca.txt.ref
samples/nonfree/dmca.xml.ref
samples/nonfree/f1040nr.html.ref
samples/nonfree/f1040nr.pdf
samples/nonfree/f1040nr.txt.ref
samples/nonfree/f1040nr.xml.ref
samples/nonfree/i1040nr.html.ref
samples/nonfree/i1040nr.pdf
samples/nonfree/i1040nr.txt.ref
samples/nonfree/i1040nr.xml.ref
samples/nonfree/kampo.html.ref
samples/nonfree/kampo.pdf
samples/nonfree/kampo.txt.ref
samples/nonfree/kampo.xml.ref
samples/nonfree/naacl06-shinyama.html.ref
samples/nonfree/naacl06-shinyama.pdf
samples/nonfree/naacl06-shinyama.txt.ref
samples/nonfree/naacl06-shinyama.xml.ref
samples/nonfree/nlp2004slides.html.ref
samples/nonfree/nlp2004slides.pdf
samples/nonfree/nlp2004slides.txt.ref
samples/nonfree/nlp2004slides.xml.ref
tools/Makefile
tools/conv_afm.py
tools/conv_cmap.py
tools/conv_glyphlist.py
tools/dumppdf.py
tools/latin2ascii.py
tools/pdf2html.cgi
tools/pdf2txt.py
tools/prof.py
tools/runapp.py
# Makefile for pdfminer

RM=rm -f

all:

clean:
	-$(RM) *.pyc *.pyo
	-$(RM) -r __pycache__
	cd cmap && make clean
#!/usr/bin/env python
__version__ = '20191125'
if __name__ == '__main__':
    print(__version__)
#!/usr/bin/env python

""" Python implementation of Arcfour encryption algorithm.

This code is in the public domain.
"""


##  Arcfour
##
class Arcfour:

    """
    >>> Arcfour(b'Key').process(b'Plaintext').hex()
    'bbf316e8d940af0ad3'
    >>> Arcfour(b'Wiki').process(b'pedia').hex()
    '1021bf0420'
    >>> Arcfour(b'Secret').process(b'Attack at dawn').hex()
    '45a01f645fc35b383552544b9bf5'
    """

    def __init__(self, key):
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        (i, j) = (self.i, self.j)
        s = self.s
        r = []
        for c in data:
            i = (i+1) % 256
            j = (j+s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i]+s[j]) % 256]
            r.append(c ^ k)
        (self.i, self.j) = (i, j)
        return bytes(r)

    encrypt = decrypt = process

new = Arcfour

# test
if __name__ == '__main__':
    import doctest
    print('pdfminer.arcfour:', doctest.testmod())
#!/usr/bin/env python

""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).

This code is in the public domain.
"""

import re
import struct


# ascii85decode(data)
def ascii85decode(data):
    """
    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    The Adobe's ASCII85 implementation is slightly different from
    its original in handling the last characters.

    The sample string is taken from:
    http://en.wikipedia.org/w/index.php?title=Ascii85

    >>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q')
    b'Man is distinguished'
    >>> ascii85decode(b'E,9)oF*2M7/c~>')
    b'pleasure.'
    """
    n = b = 0
    out = b''
    for c in data:
        if 33 <= c and c <= 117:  # b'!' <= c and c <= b'u'
            n += 1
            b = b*85+(c-33)
            if n == 5:
                out += struct.pack('>L', b)
                n = b = 0
        elif c == 122:  # b'z'
            assert n == 0
            out += b'\0\0\0\0'
        elif c == 126:  # b'~'
            if n:
                for _ in range(5-n):
                    b = b*85+84
                out += struct.pack('>L', b)[:n-1]
            break
    return out


# asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)


def asciihexdecode(data):
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. Any other characters will cause an error. If the filter encounters
    the EOD marker after reading an odd number of hexadecimal digits, it
    will behave as if a 0 followed the last digit.

    >>> asciihexdecode(b'61 62 2e6364 65')
    b'ab.cde'
    >>> asciihexdecode(b'61 62 2e6364 657>')
    b'ab.cdep'
    >>> asciihexdecode(b'7>')
    b'p'
    """
    data = data.decode('latin1')
    out = [int(hx, 16) for hx in hex_re.findall(data)]
    m = trail_re.search(data)
    if m:
        out.append(int(m.group(1), 16) << 4)
    return bytes(out)


if __name__ == '__main__':
    import doctest
    print('pdfminer.ascii85', doctest.testmod())
#!/usr/bin/env python
import re

from .psparser import PSLiteral
from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING

STRIP_NAME = re.compile(r'[0-9]+')


##  name2unicode
##
def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers."""
    if name in glyphname2unicode:
        return glyphname2unicode[name]
    m = STRIP_NAME.search(name)
    if not m:
        raise KeyError(name)
    return chr(int(m.group(0)))


##  EncodingDB
##
class EncodingDB:

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    for (name, std, mac, win, pdf) in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(klass, name, diff=None):
        cid2unicode = klass.encodings.get(name, klass.std2unicode)
        if diff:
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(x.name)
                    except KeyError:
                        pass
                    cid += 1
        return cid2unicode
#!/usr/bin/env python
import struct
import os
import os.path
from io import BytesIO
from .pdftypes import LITERALS_DCT_DECODE
from .pdfcolor import LITERAL_DEVICE_GRAY
from .pdfcolor import LITERAL_DEVICE_RGB
from .pdfcolor import LITERAL_DEVICE_CMYK


def align32(x):
    return ((x+3)//4)*4


##  BMPWriter
##
class BMPWriter:

    def __init__(self, fp, bits, width, height):
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height
        if bits == 1:
            ncols = 2
        elif bits == 8:
            ncols = 256
        elif bits == 24:
            ncols = 0
        else:
            raise ValueError(bits)
        self.linesize = align32((self.width*self.bits+7)//8)
        self.datasize = self.linesize * self.height
        headersize = 14+40+ncols*4
        info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height, 1, self.bits, 0, self.datasize, 0, 0, ncols, 0)
        assert len(info) == 40, len(info)
        header = struct.pack('<ccIHHI', b'B', b'M', headersize+self.datasize, 0, 0, headersize)
        assert len(header) == 14, len(header)
        self.fp.write(header)
        self.fp.write(info)
        if ncols == 2:
            # B&W color table
            for i in (0, 255):
                self.fp.write(struct.pack('BBBx', i, i, i))
        elif ncols == 256:
            # grayscale color table
            for i in range(256):
                self.fp.write(struct.pack('BBBx', i, i, i))
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize
        return

    def write_line(self, y, data):
        self.fp.seek(self.pos1 - (y+1)*self.linesize)
        self.fp.write(data)
        return


##  ImageWriter
##
class ImageWriter:

    def __init__(self, outdir):
        self.outdir = outdir
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
        return

    def export_image(self, image):
        stream = image.stream
        filters = stream.get_filters()
        (width, height) = image.srcsize
        if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
            ext = '.jpg'
        elif (image.bits == 1 or
              image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)):
            ext = '.%dx%d.bmp' % (width, height)
        else:
            ext = '.%d.%dx%d.img' % (image.bits, width, height)
        name = image.name+ext
        path = os.path.join(self.outdir, name)
        with open(path, 'wb') as fp:
            if ext == '.jpg':
                raw_data = stream.get_rawdata()
                if LITERAL_DEVICE_CMYK in image.colorspace:
                    from PIL import Image
                    from PIL import ImageChops
                    ifp = BytesIO(raw_data)
                    i = Image.open(ifp)
                    i = ImageChops.invert(i)
                    i = i.convert('RGB')
                    i.save(fp, 'JPEG')
                else:
                    fp.write(raw_data)
            elif image.bits == 1:
                bmp = BMPWriter(fp, 1, width, height)
                data = stream.get_data()
                i = 0
                width = (width+7)//8
                for y in range(height):
                    bmp.write_line(y, data[i:i+width])
                    i += width
            elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB:
                bmp = BMPWriter(fp, 24, width, height)
                data = stream.get_data()
                i = 0
                width = width*3
                for y in range(height):
                    bmp.write_line(y, data[i:i+width])
                    i += width
            elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY:
                bmp = BMPWriter(fp, 8, width, height)
                data = stream.get_data()
                i = 0
                for y in range(height):
                    bmp.write_line(y, data[i:i+width])
                    i += width
            else:
                fp.write(stream.get_data())
        return name
#!/usr/bin/env python
""" Standard encoding tables used in PDF.
This table is extracted from PDF Reference Manual 1.6, pp.925
"D.1 Latin Character Set and Encodings"
"""
ENCODING = [
# (name, std, mac, win, pdf)
('A', 65, 65, 65, 65),
('AE', 225, 174, 198, 198),
('Aacute', None, 231, 193, 193),
('Acircumflex', None, 229, 194, 194),
('Adieresis', None, 128, 196, 196),
('Agrave', None, 203, 192, 192),
('Aring', None, 129, 197, 197),
('Atilde', None, 204, 195, 195),
('B', 66, 66, 66, 66),
('C', 67, 67, 67, 67),
('Ccedilla', None, 130, 199, 199),
('D', 68, 68, 68, 68),
('E', 69, 69, 69, 69),
('Eacute', None, 131, 201, 201),
('Ecircumflex', None, 230, 202, 202),
('Edieresis', None, 232, 203, 203),
('Egrave', None, 233, 200, 200),
('Eth', None, None, 208, 208),
('Euro', None, None, 128, 160),
('F', 70, 70, 70, 70),
('G', 71, 71, 71, 71),
('H', 72, 72, 72, 72),
('I', 73, 73, 73, 73),
('Iacute', None, 234, 205, 205),
('Icircumflex', None, 235, 206, 206),
('Idieresis', None, 236, 207, 207),
('Igrave', None, 237, 204, 204),
('J', 74, 74, 74, 74),
('K', 75, 75, 75, 75),
('L', 76, 76, 76, 76),
('Lslash', 232, None, None, 149),
('M', 77, 77, 77, 77),
('N', 78, 78, 78, 78),
('Ntilde', None, 132, 209, 209),
('O', 79, 79, 79, 79),
('OE', 234, 206, 140, 150),
('Oacute', None, 238, 211, 211),
('Ocircumflex', None, 239, 212, 212),
('Odieresis', None, 133, 214, 214),
('Ograve', None, 241, 210, 210),
('Oslash', 233, 175, 216, 216),
('Otilde', None, 205, 213, 213),
('P', 80, 80, 80, 80),
('Q', 81, 81, 81, 81),
('R', 82, 82, 82, 82),
('S', 83, 83, 83, 83),
('Scaron', None, None, 138, 151),
('T', 84, 84, 84, 84),
('Thorn', None, None, 222, 222),
('U', 85, 85, 85, 85),
('Uacute', None, 242, 218, 218),
('Ucircumflex', None, 243, 219, 219),
('Udieresis', None, 134, 220, 220),
('Ugrave', None, 244, 217, 217),
('V', 86, 86, 86, 86),
('W', 87, 87, 87, 87),
('X', 88, 88, 88, 88),
('Y', 89, 89, 89, 89),
('Yacute', None, None, 221, 221),
('Ydieresis', None, 217, 159, 152),
('Z', 90, 90, 90, 90),
('Zcaron', None, None, 142, 153),
('a', 97, 97, 97, 97),
('aacute', None, 135, 225, 225),
('acircumflex', None, 137, 226, 226),
('acute', 194, 171, 180, 180),
('adieresis', None, 138, 228, 228),
('ae', 241, 190, 230, 230),
('agrave', None, 136, 224, 224),
('ampersand', 38, 38, 38, 38),
('aring', None, 140, 229, 229),
('asciicircum', 94, 94, 94, 94),
('asciitilde', 126, 126, 126, 126),
('asterisk', 42, 42, 42, 42),
('at', 64, 64, 64, 64),
('atilde', None, 139, 227, 227),
('b', 98, 98, 98, 98),
('backslash', 92, 92, 92, 92),
('bar', 124, 124, 124, 124),
('braceleft', 123, 123, 123, 123),
('braceright', 125, 125, 125, 125),
('bracketleft', 91, 91, 91, 91),
('bracketright', 93, 93, 93, 93),
('breve', 198, 249, None, 24),
('brokenbar', None, None, 166, 166),
('bullet', 183, 165, 149, 128),
('c', 99, 99, 99, 99),
('caron', 207, 255, None, 25),
('ccedilla', None, 141, 231, 231),
('cedilla', 203, 252, 184, 184),
('cent', 162, 162, 162, 162),
('circumflex', 195, 246, 136, 26),
('colon', 58, 58, 58, 58),
('comma', 44, 44, 44, 44),
('copyright', None, 169, 169, 169),
('currency', 168, 219, 164, 164),
('d', 100, 100, 100, 100),
('dagger', 178, 160, 134, 129),
('daggerdbl', 179, 224, 135, 130),
('degree', None, 161, 176, 176),
('dieresis', 200, 172, 168, 168),
('divide', None, 214, 247, 247),
('dollar', 36, 36, 36, 36),
('dotaccent', 199, 250, None, 27),
('dotlessi', 245, 245, None, 154),
('e', 101, 101, 101, 101),
('eacute', None, 142, 233, 233),
('ecircumflex', None, 144, 234, 234),
('edieresis', None, 145, 235, 235),
('egrave', None, 143, 232, 232),
('eight', 56, 56, 56, 56),
('ellipsis', 188, 201, 133, 131),
('emdash', 208, 209, 151, 132),
('endash', 177, 208, 150, 133),
('equal', 61, 61, 61, 61),
('eth', None, None, 240, 240),
('exclam', 33, 33, 33, 33),
('exclamdown', 161, 193, 161, 161),
('f', 102, 102, 102, 102),
('fi', 174, 222, None, 147),
('five', 53, 53, 53, 53),
('fl', 175, 223, None, 148),
('florin', 166, 196, 131, 134),
('four', 52, 52, 52, 52),
('fraction', 164, 218, None, 135),
('g', 103, 103, 103, 103),
('germandbls', 251, 167, 223, 223),
('grave', 193, 96, 96, 96),
('greater', 62, 62, 62, 62),
('guillemotleft', 171, 199, 171, 171),
('guillemotright', 187, 200, 187, 187),
('guilsinglleft', 172, 220, 139, 136),
('guilsinglright', 173, 221, 155, 137),
('h', 104, 104, 104, 104),
('hungarumlaut', 205, 253, None, 28),
('hyphen', 45, 45, 45, 45),
('i', 105, 105, 105, 105),
('iacute', None, 146, 237, 237),
('icircumflex', None, 148, 238, 238),
('idieresis', None, 149, 239, 239),
('igrave', None, 147, 236, 236),
('j', 106, 106, 106, 106),
('k', 107, 107, 107, 107),
('l', 108, 108, 108, 108),
('less', 60, 60, 60, 60),
('logicalnot', None, 194, 172, 172),
('lslash', 248, None, None, 155),
('m', 109, 109, 109, 109),
('macron', 197, 248, 175, 175),
('minus', None, None, None, 138),
('mu', None, 181, 181, 181),
('multiply', None, None, 215, 215),
('n', 110, 110, 110, 110),
('nbspace', None, 202, 160, None),
('nine', 57, 57, 57, 57),
('ntilde', None, 150, 241, 241),
('numbersign', 35, 35, 35, 35),
('o', 111, 111, 111, 111),
('oacute', None, 151, 243, 243),
('ocircumflex', None, 153, 244, 244),
('odieresis', None, 154, 246, 246),
('oe', 250, 207, 156, 156),
('ogonek', 206, 254, None, 29),
('ograve', None, 152, 242, 242),
('one', 49, 49, 49, 49),
('onehalf', None, None, 189, 189),
('onequarter', None, None, 188, 188),
('onesuperior', None, None, 185, 185),
('ordfeminine', 227, 187, 170, 170),
('ordmasculine', 235, 188, 186, 186),
('oslash', 249, 191, 248, 248),
('otilde', None, 155, 245, 245),
('p', 112, 112, 112, 112),
('paragraph', 182, 166, 182, 182),
('parenleft', 40, 40, 40, 40),
('parenright', 41, 41, 41, 41),
('percent', 37, 37, 37, 37),
('period', 46, 46, 46, 46),
('periodcentered', 180, 225, 183, 183),
('perthousand', 189, 228, 137, 139),
('plus', 43, 43, 43, 43),
('plusminus', None, 177, 177, 177),
('q', 113, 113, 113, 113),
('question', 63, 63, 63, 63),
('questiondown', 191, 192, 191, 191),
('quotedbl', 34, 34, 34, 34),
('quotedblbase', 185, 227, 132, 140),
('quotedblleft', 170, 210, 147, 141),
('quotedblright', 186, 211, 148, 142),
('quoteleft', 96, 212, 145, 143),
('quoteright', 39, 213, 146, 144),
('quotesinglbase', 184, 226, 130, 145),
('quotesingle', 169, 39, 39, 39),
('r', 114, 114, 114, 114),
('registered', None, 168, 174, 174),
('ring', 202, 251, None, 30),
('s', 115, 115, 115, 115),
('scaron', None, None, 154, 157),
('section', 167, 164, 167, 167),
('semicolon', 59, 59, 59, 59),
('seven', 55, 55, 55, 55),
('six', 54, 54, 54, 54),
('slash', 47, 47, 47, 47),
('space', 32, 32, 32, 32),
('sterling', 163, 163, 163, 163),
('t', 116, 116, 116, 116),
('thorn', None, None, 254, 254),
('three', 51, 51, 51, 51),
('threequarters', None, None, 190, 190),
('threesuperior', None, None, 179, 179),
('tilde', 196, 247, 152, 31),
('trademark', None, 170, 153, 146),
('two', 50, 50, 50, 50),
('twosuperior', None, None, 178, 178),
('u', 117, 117, 117, 117),
('uacute', None, 156, 250, 250),
('ucircumflex', None, 158, 251, 251),
('udieresis', None, 159, 252, 252),
('ugrave', None, 157, 249, 249),
('underscore', 95, 95, 95, 95),
('v', 118, 118, 118, 118),
('w', 119, 119, 119, 119),
('x', 120, 120, 120, 120),
('y', 121, 121, 121, 121),
('yacute', None, None, 253, 253),
('ydieresis', None, 216, 255, 255),
('yen', 165, 180, 165, 165),
('z', 122, 122, 122, 122),
('zcaron', None, None, 158, 158),
('zero', 48, 48, 48, 48),
]
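
A note on reading the table above: each tuple appears to carry a glyph name followed by its code point in the Standard, MacRoman, WinAnsi and PDFDoc encodings, with None where the glyph has no slot in that encoding. Below is a minimal, illustrative sketch of how such a table can be consumed; it assumes the tuples are collected in a list named ENCODING (the actual assignment sits above this excerpt) and the helper name is hypothetical, not part of the library.

# Illustrative sketch only (assumption: the table above is bound to a name
# such as ENCODING, with columns (glyph, std, mac, win, pdf)).
ENCODING = [
    ('E', 69, 69, 69, 69),
    ('Euro', None, None, 128, 160),
    ('eacute', None, 142, 233, 233),
    # ... remaining rows as listed above ...
]

def build_code2name_maps(table):
    # One code-point -> glyph-name dict per encoding; None entries are skipped.
    std, mac, win, pdf = {}, {}, {}, {}
    for (name, s, m, w, p) in table:
        for (code, target) in ((s, std), (m, mac), (w, win), (p, pdf)):
            if code is not None:
                target[code] = name
    return std, mac, win, pdf

std2name, mac2name, win2name, pdf2name = build_code2name_maps(ENCODING)
print(win2name[128])   # 'Euro'
print(mac2name[142])   # 'eacute'
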
#!/usr/bin/env python
from io import BytesIO


class CorruptDataError(Exception):
    pass


##  LZWDecoder
##
class LZWDecoder:

    def __init__(self, fp):
        self.fp = fp
        self.buff = 0
        self.bpos = 8
        self.nbits = 9
        self.table = None
        self.prevbuf = None
        return

    def readbits(self, bits):
        v = 0
        while 1:
            # the number of remaining bits we can get from the current buffer.
            r = 8-self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v << r) | (self.buff & ((1 << r)-1))
                bits -= r
                x = self.fp.read(1)
                if not x:
                    raise EOFError
                self.buff = x[0]
                self.bpos = 0
        return v

    def feed(self, code):
        x = b''
        if code == 256:
            self.table = [bytes([c]) for c in range(256)]  # 0-255
            self.table.append(None)  # 256
            self.table.append(None)  # 257
            self.prevbuf = b''
            self.nbits = 9
        elif code == 257:
            pass
        elif not self.prevbuf:
            x = self.prevbuf = self.table[code]
        else:
            if code < len(self.table):
                x = self.table[code]
                self.table.append(self.prevbuf+x[:1])
            elif code == len(self.table):
                self.table.append(self.prevbuf+self.prevbuf[:1])
                x = self.table[code]
            else:
                raise CorruptDataError
            l = len(self.table)
            if l == 511:
                self.nbits = 10
            elif l == 1023:
                self.nbits = 11
            elif l == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self):
        while 1:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            try:
                x = self.feed(code)
            except CorruptDataError:
                # just ignore corrupt data and stop yielding there
                break
            yield x
            #logging.debug('nbits=%d, code=%d, output=%r, table=%r' %
            #              (self.nbits, code, x, self.table[258:]))
        return


# lzwdecode
def lzwdecode(data):
    """
    >>> lzwdecode(bytes.fromhex('800b6050220c0c8501'))
    b'-----A---B'
    """
    fp = BytesIO(data)
    return b''.join(LZWDecoder(fp).run())


if __name__ == '__main__':
    import doctest
    print('pdfminer.lzw', doctest.testmod())
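
For reference, a brief usage sketch of the decoder above. The sample bytes come from the module's own doctest; the import path pdfminer.lzw is an assumption taken from the __main__ banner.

from io import BytesIO
from pdfminer.lzw import LZWDecoder, lzwdecode   # assumed import path

raw = bytes.fromhex('800b6050220c0c8501')   # doctest sample above

# One-shot helper:
assert lzwdecode(raw) == b'-----A---B'

# Or stream the output chunk by chunk through the generator interface:
for chunk in LZWDecoder(BytesIO(raw)).run():
    print(chunk)
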
#!/usr/bin/env python
from .psparser import LIT


##  PDFColorSpace
##
LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')


class PDFColorSpace:

    def __init__(self, name, ncomponents):
        self.name = name
        self.ncomponents = ncomponents
        return

    def __repr__(self):
        return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)


PREDEFINED_COLORSPACE = dict(
    (name, PDFColorSpace(name, n)) for (name, n) in {
        'CalRGB': 3,
        'CalGray': 1,
        'Lab': 3,
        'DeviceRGB': 3,
        'DeviceCMYK': 4,
        'DeviceGray': 1,
        'Separation': 1,
        'Indexed': 1,
        'Pattern': 1,
    }.items())
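
A minimal sketch of how the table above can be queried, for example to recover the number of colour components for a named colour space; the helper name below is hypothetical, not part of the module.

def ncomponents_for(name, default=1):
    # Look the name up in PREDEFINED_COLORSPACE; fall back to a single
    # component when the space is not predefined.
    cs = PREDEFINED_COLORSPACE.get(name)
    return cs.ncomponents if cs is not None else default

print(ncomponents_for('DeviceCMYK'))        # 4
print(PREDEFINED_COLORSPACE['DeviceRGB'])   # <PDFColorSpace: DeviceRGB, ncomponents=3>
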
#!/usr/bin/env python

#
# RunLength decoder (Adobe version) implementation based on PDF Reference
# version 1.4 section 3.3.4.
#
#  * public domain *
#


def rldecode(data):
    r"""
    RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.
    >>> s = b'\x05123456\xfa7\x04abcde\x80junk'
    >>> rldecode(s)
    b'1234567777777abcde'
    """
    decoded = b''
    i = 0
    while i < len(data):
        #print('data[%d]=:%d:' % (i,ord(data[i])))
        length = data[i]
        if length == 128:
            break
        if length >= 0 and length < 128:
            run = data[i+1:(i+1)+(length+1)]
            #print('length=%d, run=%s' % (length+1,run))
            decoded += run
            i = (i+1) + (length+1)
        if length > 128:
            run = data[i+1:i+2]*(257-length)
            #print('length=%d, run=%s' % (257-length,run))
            decoded += run
            i = (i+1) + 1
    return decoded


if __name__ == '__main__':
    import doctest
    print('pdfminer.runlength', doctest.testmod())
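
To make the byte format described in the docstring concrete, here is a naive encoding counterpart. It is a hypothetical helper, not part of the library: it emits literal runs only, so it never compresses, but its output round-trips through rldecode above.

def rlencode_literal(data):
    # Emit literal runs of at most 128 bytes, each prefixed by (length - 1),
    # then the EOD marker 128 (0x80).
    out = b''
    for i in range(0, len(data), 128):
        chunk = data[i:i+128]
        out += bytes([len(chunk) - 1]) + chunk
    return out + b'\x80'

assert rldecode(rlencode_literal(b'1234567777777abcde')) == b'1234567777777abcde'
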
# GNUMakefile for test

RM=rm -f
CMP=:
PYTHON=python3
PDF2TXT=PYTHONPATH=../.. $(PYTHON) ../../tools/pdf2txt.py

XMLS= \
	rc4-40.xml \
	rc4-128.xml \
	aes-128.xml \
	aes-128-m.xml \
	aes-256.xml \
	aes-256-m.xml

all: xmls

test:
	$(MAKE) all CMP=cmp

clean:
	-$(RM) $(XMLS)

xmls: $(XMLS)

.SUFFIXES: .pdf .xml
.pdf.xml:
	$(PDF2TXT) -p1 -V -t xml -P foo -o $@ $<
	$(CMP) $@ base.xml
	$(PDF2TXT) -p1 -V -t xml -P baz -o $@ $<
	$(CMP) $@ base.xml