test cases updated

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@282 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-12-25 08:41:11 +00:00
parent 84ed94aec0
commit 5d98a27d9c
21 changed files with 5742 additions and 24161 deletions

View File

@ -5,7 +5,7 @@ RM=rm -f
CMP=:
PYTHON=python2
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -Dx -p1
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1
HTMLS=$(HTMLS_FREE) $(HTMLS_NONFREE)
HTMLS_FREE= \
@ -49,8 +49,7 @@ XMLS_NONFREE= \
nonfree/naacl06-shinyama.xml \
nonfree/nlp2004slides.xml
all:
$(MAKE) test CMP=cmp
all: test
test: htmls texts xmls

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,16 @@ number of other significant copyright-related issues.
The DMCA is divided into five titles:
!
!
!
!
!
Title I, the “WIPO Copyright and Performances and Phonograms
Treaties Implementation Act of 1998,” implements the WIPO
treaties.
@ -32,16 +42,6 @@ of rights in motion pictures.
Title V, the “Vessel Hull Design Protection Act,” creates a new form
of protection for the design of vessel hulls.
!
!
!
!
!
This memorandum summarizes briefly each title of the DMCA. It provides
merely an overview of the laws provisions; for purposes of length and readability a
significant amount of detail has been omitted. A complete understanding of any

View File

@ -568,7 +568,42 @@
</text>
</textline>
</textbox>
<textbox id="5" bbox="180.000,235.452,504.108,464.124">
<textbox id="5" bbox="144.000,450.000,152.004,462.480">
<textline bbox="144.000,450.000,152.004,462.480">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,450.000,152.004,462.480" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="6" bbox="144.000,409.680,152.004,422.160">
<textline bbox="144.000,409.680,152.004,422.160">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,409.680,152.004,422.160" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="7" bbox="144.000,369.360,152.004,381.840">
<textline bbox="144.000,369.360,152.004,381.840">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,369.360,152.004,381.840" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="8" bbox="144.000,329.040,152.004,341.520">
<textline bbox="144.000,329.040,152.004,341.520">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,329.040,152.004,341.520" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="9" bbox="144.000,248.400,152.004,260.880">
<textline bbox="144.000,248.400,152.004,260.880">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,248.400,152.004,260.880" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="10" bbox="180.000,235.452,504.108,464.124">
<textline bbox="180.000,450.492,503.985,464.124">
<text font="Garamond" bbox="180.000,450.492,187.380,464.064" size="13.572">T</text>
<text font="Garamond" bbox="187.440,450.492,190.188,464.064" size="13.572">i</text>
@ -1669,41 +1704,6 @@
</text>
</textline>
</textbox>
<textbox id="6" bbox="144.000,450.000,152.004,462.480">
<textline bbox="144.000,450.000,152.004,462.480">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,450.000,152.004,462.480" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="7" bbox="144.000,409.680,152.004,422.160">
<textline bbox="144.000,409.680,152.004,422.160">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,409.680,152.004,422.160" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="8" bbox="144.000,369.360,152.004,381.840">
<textline bbox="144.000,369.360,152.004,381.840">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,369.360,152.004,381.840" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="9" bbox="144.000,329.040,152.004,341.520">
<textline bbox="144.000,329.040,152.004,341.520">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,329.040,152.004,341.520" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="10" bbox="144.000,248.400,152.004,260.880">
<textline bbox="144.000,248.400,152.004,260.880">
<text font="ELCKGH+WPTypographicSymbols" bbox="144.000,248.400,152.004,260.880" size="12.480">!</text>
<text>
</text>
</textline>
</textbox>
<textbox id="11" bbox="108.000,168.360,504.108,222.144">
<textline bbox="144.000,208.572,504.060,222.144">
<text font="Garamond" bbox="144.000,208.572,151.380,222.144" size="13.572">T</text>
@ -2194,20 +2194,20 @@
<textgroup bbox="144.000,235.452,504.108,490.944">
<textbox id="4" bbox="144.000,477.372,323.881,490.944" />
<textgroup bbox="144.000,235.452,504.108,464.124">
<textbox id="5" bbox="180.000,235.452,504.108,464.124" />
<textgroup bbox="144.000,248.400,152.004,462.480">
<textgroup bbox="144.000,329.040,152.004,462.480">
<textgroup bbox="144.000,369.360,152.004,462.480">
<textgroup bbox="144.000,409.680,152.004,462.480">
<textbox id="6" bbox="144.000,450.000,152.004,462.480" />
<textbox id="7" bbox="144.000,409.680,152.004,422.160" />
<textbox id="5" bbox="144.000,450.000,152.004,462.480" />
<textbox id="6" bbox="144.000,409.680,152.004,422.160" />
</textgroup>
<textbox id="8" bbox="144.000,369.360,152.004,381.840" />
<textbox id="7" bbox="144.000,369.360,152.004,381.840" />
</textgroup>
<textbox id="9" bbox="144.000,329.040,152.004,341.520" />
<textbox id="8" bbox="144.000,329.040,152.004,341.520" />
</textgroup>
<textbox id="10" bbox="144.000,248.400,152.004,260.880" />
<textbox id="9" bbox="144.000,248.400,152.004,260.880" />
</textgroup>
<textbox id="10" bbox="180.000,235.452,504.108,464.124" />
</textgroup>
</textgroup>
<textbox id="11" bbox="108.000,168.360,504.108,222.144" />

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@ OMB No. 1545-0074
Identifying number (see page 8)
I
I
Check if:
@ -18,11 +18,11 @@ Type of entry visa (see page 8)
7 a
7a
Yourself
7 b
7b
Spouse
@ -45,10 +45,8 @@ on 7a and 7b
No. of children on
7c who:
lived with you
● did not live with
● lived with you
● did not live with
you due to divorce
or separation
@ -60,14 +58,13 @@ on lines above
8
9 a
9a
A
1040NR
Form
Department of the Treasury
beginning
Internal Revenue Service
@ -92,8 +89,8 @@ Country 䊳
Of what country were you a citizen or national during the tax year? 䊳
Give address outside the United States to which you want any
Give address in the country where you are a permanent resident.
Give address outside the United States to which you want any
refund check mailed. If same as above, write “Same.”
If same as above, write “Same.”
@ -103,12 +100,12 @@ If same as above, write “Same.”
.epyt ro tnirp esaelP
Please print or type.
.dlehhtiw saw xat fi R-9901 )s(mroF hcatta oslA
.ereh 2-W smroF hcattA
Also attach Form(s) 1099-R if tax was withheld.
Attach Forms W-2 here.
@ -128,6 +125,10 @@ child for child tax
credit (see page 9)
Filing Status and Exemptions for Individuals (see page 8)
@ -160,14 +161,10 @@ Qualifying widow(er) with dependent child (see page 9)
Caution: Do not check box 7a if your parent (or someone else) can claim you as a dependent.
Do not check box 7b if your spouse had any U.S. gross income.
7 c
7c
Dependents: (see page 9)
(1) First name
@ -212,25 +209,22 @@ identifying number
10a
1 1
1 2
1 3
1 4
1 5
11
12
13
14
15
16b
17b
18
19
20
21
1 8
1 9
2 0
2 1
23
2 3
3 4
3 5
34
35
d
@ -245,23 +239,7 @@ d
9 b
8
9 a
b
10a
b
9b
Total number of exemptions claimed
@ -273,22 +251,21 @@ Tax-exempt interest. Do not include on line 9a
Ordinary dividends
10b
Qualified dividends (see page 11)
1 1
11
Taxable refunds, credits, or offsets of state and local income taxes (see page 11)
1 2
12
Scholarship and fellowship grants. Attach Form(s) 1042-S or required statement (see page 11)
1 3
13
Business income or (loss). Attach Schedule C or C-EZ (Form 1040)
1 4
14
Capital gain or (loss). Attach Schedule D (Form 1040) if required. If not required, check here
1 5
15
Other gains or (losses). Attach Form 4797
16a
16b
@ -296,8 +273,6 @@ Other gains or (losses). Attach Form 4797
Taxable amount (see page 12)
IRA distributions
17a
@ -308,77 +283,88 @@ Pensions and annuities
1 8
18
Rental real estate, royalties, partnerships, trusts, etc. Attach Schedule E (Form 1040)
1 9
19
Farm income or (loss). Attach Schedule F (Form 1040)
2 0
20
Unemployment compensation
2 1
21
Other income. List type and amount (see page 15)
2 2
2 2
22
22
Total income exempt by a treaty from page 5, Item M
2 3
Add lines 8, 9a, 10a, 1115, 16b, and 17b21. This is your total effectively connected income
23
Add lines 8, 9a, 10a, 1115, 16b, and 17b21. This is your total effectively connected income
2 4
2 4
24
24
Educator expenses (see page 15)
2 5
2 5
25
25
Health savings account deduction. Attach Form 8889
2 6
2 6
26
26
Moving expenses. Attach Form 3903
2 7
27
2 7
27
Self-employed SEP, SIMPLE, and qualified plans
2 8
28
2 8
28
Self-employed health insurance deduction (see page 16)
2 9
2 9
29
29
Penalty on early withdrawal of savings
3 0
30
3 0
30
Scholarship and fellowship grants excluded
3 1
31
IRA deduction (see page 16)
3 1
3 2
31
32
3 2
32
Student loan interest deduction (see page 16)
3 3
33
3 3
33
Domestic production activities deduction. Attach Form 8903
3 4
34
Add lines 24 through 33
3 5
Subtract line 34 from line 23. Enter here and on line 36. This is your adjusted gross income
35
Subtract line 34 from line 23. Enter here and on line 36. This is your adjusted gross income 䊳
For Disclosure, Privacy Act, and Paperwork Reduction Act Notice, see page 32.
Cat. No. 11364D
ssenisuB/edarT .S.U htiW detcennoC ylevitceffE emocnI
Income Effectively Connected With U.S. Trade/Business
Enclose, but do not attach, any payment.
8
9a
b
10a
b
Adjusted Gross Income
.tnemyap yna ,hcatta ton od tub ,esolcnE
emocnI ssorG detsujdA

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,19 +1,11 @@
Leadpct: 0% Pt. size: 9.5 ❏ Draft
Userid: ________ DTD INSTR04
PAGER/SGML
Page 1 of 48
________
Leadpct: 0%
DTD INSTR04
Pt. size: 9.5 ❏ Draft
Userid:
(Init. & date)
Fileid:
D:\USERS\8fllb\documents\epicfiles\2007Instructions1040NR.sgm
Page 1 of 48 Instructions for Form 1040NR
7:48 - 6-DEC-2007
Instructions for Form 1040NR
❏ Ok to Print
@ -27,6 +19,122 @@ U.S. Nonresident Alien Income Tax Return
Department of the Treasury
Internal Revenue Service
use a different address this year. See
Section references are to the Internal
Where To File on page 4.
Revenue Code unless otherwise noted.
General Instructions deduction. The deduction rate for
Domestic production activities
2007 is increased to 6%.
Whats New for 2007
Unreported social security and
Medicare tax on wages.
If you are
Tax benefits extended. The following
an employee and your employer did not
tax benefits were extended through
withhold social security and Medicare
2007.
• Deduction for educator expenses in
tax, see Form 8919 to figure and report
this tax.
figuring adjusted gross income.
• District of Columbia first-time
Refundable credit for prior-year
minimum tax.
If you have an unused
homebuyer credit.
minimum tax credit carryforward from
Alternative minimum tax (AMT)
2004, see Form 8801 to find if you can
exemption amount decreased. The
take this credit.
AMT exemption amount is decreased to
Health savings account (HSA)
$33,750 ($45,000 if a qualifying
funding distributions. You may be
widow(er); $22,500 if married filing
able to elect to exclude from income a
separately).
distribution made from your IRA to your
At the time these instructions
HSA. See the instructions for lines 16a
!
went to print, Congress was
and 16b beginning on page 12.
considering legislation that
CAUTION
New recordkeeping requirements for
would increase the amounts above. To
contributions of money.
For
find out if this legislation was enacted,
charitable contributions of money,
and for more details, see the
regardless of the amount, you must
Instructions for Form 6251.
maintain as a record of the contribution
IRA deduction expanded.
If you were
a bank record (such as a cancelled
covered by a retirement plan, you may
check) or a written record from the
be able to take an IRA deduction if your
charity. The written record must include
2007 modified adjusted gross income
the name of the charity, date, and
(AGI) is less than $62,000 ($103,000 if
amount of the contribution. See Gifts to
a qualifying widow(er)).
U.S. Charities that begins on page 26.
You may be able to deduct up to an
Exemption for housing a person
additional $3,000 if you were a
displaced by Hurricane Katrina
participant in a 401(k) plan and your
expires. The additional exemption
employer was in bankruptcy in an
amount for housing a person displaced
earlier year.
by Hurricane Katrina does not apply for
2007 or later years.
Standard mileage rates. The 2007
Telephone excise tax credit.
rate for business use of your vehicle is
This
481/2 cents a mile. The 2007 rate for
credit was available only on your 2006
use of your vehicle to move is 20 cents
return. If you filed but did not request it
a mile. The special rate for charitable
on your 2006 return, file Form 1040X
use of your vehicle to provide relief
using a simplified procedure explained
related to Hurricane Katrina has
in its instructions to amend your 2006
expired.
return. If you were not required to file a
2006 return, see the 2006 Form
Elective salary deferrals. The
1040EZ-T.
maximum amount you can defer under
all plans is generally limited to $15,500
Whats New for 2008
($10,500 if you only have SIMPLE
plans; $18,500 for section 403(b) plans
IRA deduction expanded. You may
if you qualify for the 15-year rule). See
be able to deduct up to $5,000 ($6,000
the instructions for line 8 on page 10.
if age 50 or older at the end of the
Mailing your return.
If you are filing
year). You may be able to take an IRA
deduction if you were covered by a
the return for an estate or trust, you will
Cat. No. 11368V
retirement plan and your 2008 modified
AGI is less than $63,000 ($105,000) if a
qualifying widow(er)).
@ -71,8 +179,7 @@ investment income on a parents return
and the special rule for when a child
must file Form 6251 will also apply to
the children listed above.
Expiring tax benefits.
The following
Expiring tax benefits. The following
benefits are scheduled to expire and
will not apply for 2008.
• Deduction for educator expenses in
@ -85,130 +192,4 @@ property.
homebuyer credit (for homes
purchased after 2007).
Section references are to the Internal
Revenue Code unless otherwise noted.
General Instructions
Whats New for 2007
Tax benefits extended.
The following
tax benefits were extended through
2007.
• Deduction for educator expenses in
figuring adjusted gross income.
• District of Columbia first-time
homebuyer credit.
Alternative minimum tax (AMT)
exemption amount decreased.
The
AMT exemption amount is decreased to
$33,750 ($45,000 if a qualifying
widow(er); $22,500 if married filing
separately).
At the time these instructions
!
went to print, Congress was
considering legislation that
CAUTION
would increase the amounts above. To
find out if this legislation was enacted,
and for more details, see the
Instructions for Form 6251.
IRA deduction expanded.
If you were
covered by a retirement plan, you may
be able to take an IRA deduction if your
2007 modified adjusted gross income
(AGI) is less than $62,000 ($103,000 if
a qualifying widow(er)).
You may be able to deduct up to an
additional $3,000 if you were a
participant in a 401(k) plan and your
employer was in bankruptcy in an
earlier year.
Standard mileage rates.
The 2007
rate for business use of your vehicle is
481/2 cents a mile. The 2007 rate for
use of your vehicle to move is 20 cents
a mile. The special rate for charitable
use of your vehicle to provide relief
related to Hurricane Katrina has
expired.
Elective salary deferrals.
The
maximum amount you can defer under
all plans is generally limited to $15,500
($10,500 if you only have SIMPLE
plans; $18,500 for section 403(b) plans
if you qualify for the 15-year rule). See
the instructions for line 8 on page 10.
Mailing your return.
If you are filing
the return for an estate or trust, you will
use a different address this year. See
Where To File on page 4.
Domestic production activities
deduction.
The deduction rate for
2007 is increased to 6%.
Unreported social security and
Medicare tax on wages.
If you are
an employee and your employer did not
withhold social security and Medicare
tax, see Form 8919 to figure and report
this tax.
Refundable credit for prior-year
minimum tax.
If you have an unused
minimum tax credit carryforward from
2004, see Form 8801 to find if you can
take this credit.
Health savings account (HSA)
funding distributions.
You may be
able to elect to exclude from income a
distribution made from your IRA to your
HSA. See the instructions for lines 16a
and 16b beginning on page 12.
New recordkeeping requirements for
contributions of money.
For
charitable contributions of money,
regardless of the amount, you must
maintain as a record of the contribution
a bank record (such as a cancelled
check) or a written record from the
charity. The written record must include
the name of the charity, date, and
amount of the contribution. See Gifts to
U.S. Charities that begins on page 26.
Exemption for housing a person
displaced by Hurricane Katrina
expires.
The additional exemption
amount for housing a person displaced
by Hurricane Katrina does not apply for
2007 or later years.
Telephone excise tax credit.
This
credit was available only on your 2006
return. If you filed but did not request it
on your 2006 return, file Form 1040X
using a simplified procedure explained
in its instructions to amend your 2006
return. If you were not required to file a
2006 return, see the 2006 Form
1040EZ-T.
Whats New for 2008
IRA deduction expanded.
You may
be able to deduct up to $5,000 ($6,000
if age 50 or older at the end of the
year). You may be able to take an IRA
deduction if you were covered by a
Cat. No. 11368V

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +1,12 @@
第 ›Ÿ˜ž 号
平成 ™— 年 月 ™œ 日 金曜日
平成 ™— 年 月 ™œ 日 金曜日
第 ›Ÿ˜ž 号
政令第百四十九号
道路交通法施行令の一部を改正する政令

View File

@ -1,57 +1,22 @@
<?xml version="1.0" encoding="utf-8" ?>
<pages>
<page id="1" bbox="0.000,0.000,595.000,842.000" rotate="0">
<textbox id="0" bbox="392.500,787.804,457.500,800.144">
<textline bbox="392.500,787.804,457.500,800.144">
<text font="Ryumin-Light" bbox="392.500,787.804,402.500,800.144" size="12.340">第</text>
<text> </text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="405.000,789.064,415.000,798.234" size="10.000"></text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="415.000,789.064,425.000,798.234" size="10.000">Ÿ</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="425.000,789.064,435.000,798.234" size="10.000">˜</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="435.000,789.064,445.000,798.234" size="10.000">ž</text>
<text> </text>
<text font="Ryumin-Light" bbox="447.500,787.804,457.500,800.144" size="12.340">号</text>
<text>
</text>
</textline>
</textbox>
<textbox id="1" bbox="527.500,789.064,537.500,798.234">
<textline bbox="527.500,789.064,537.500,798.234">
<text font="GMALPM+DFHSMincho-W3G014" bbox="527.500,789.064,537.500,798.234" size="10.000"></text>
<text>
</text>
</textline>
</textbox>
<textbox id="2" bbox="267.500,787.572,279.500,802.380">
<textline bbox="267.500,787.572,279.500,802.380">
<text font="Ryumin-Light" bbox="267.500,787.572,279.500,802.380" size="14.808">官</text>
<text>
</text>
</textline>
</textbox>
<textbox id="3" bbox="315.500,787.572,327.500,802.380">
<textline bbox="315.500,787.572,327.500,802.380">
<text font="Ryumin-Light" bbox="315.500,787.572,327.500,802.380" size="14.808">報</text>
<text>
</text>
</textline>
</textbox>
<textbox id="4" bbox="87.500,787.804,242.500,800.144">
<textbox id="0" bbox="87.500,787.804,242.500,800.144">
<textline bbox="87.500,787.804,242.500,800.144">
<text font="Ryumin-Light" bbox="87.500,787.804,97.500,800.144" size="12.340">平</text>
<text font="Ryumin-Light" bbox="97.500,787.804,107.500,800.144" size="12.340">成</text>
<text> </text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="110.000,789.064,120.000,798.234" size="10.000">™</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="120.000,789.064,130.000,798.234" size="10.000">—</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="110.000,789.064,120.000,798.234" size="9.170">™</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="120.000,789.064,130.000,798.234" size="9.170">—</text>
<text> </text>
<text font="Ryumin-Light" bbox="132.500,787.804,142.500,800.144" size="12.340">年</text>
<text> </text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="145.000,789.064,155.000,798.234" size="10.000"></text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="145.000,789.064,155.000,798.234" size="9.170"></text>
<text> </text>
<text font="Ryumin-Light" bbox="157.500,787.804,167.500,800.144" size="12.340">月</text>
<text> </text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="170.000,789.064,180.000,798.234" size="10.000">™</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="180.000,789.064,190.000,798.234" size="10.000">œ</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="170.000,789.064,180.000,798.234" size="9.170">™</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="180.000,789.064,190.000,798.234" size="9.170">œ</text>
<text> </text>
<text font="Ryumin-Light" bbox="192.500,787.804,202.500,800.144" size="12.340">日</text>
<text> </text>
@ -62,6 +27,41 @@
</text>
</textline>
</textbox>
<textbox id="1" bbox="267.500,787.572,279.500,802.380">
<textline bbox="267.500,787.572,279.500,802.380">
<text font="Ryumin-Light" bbox="267.500,787.572,279.500,802.380" size="14.808">官</text>
<text>
</text>
</textline>
</textbox>
<textbox id="2" bbox="315.500,787.572,327.500,802.380">
<textline bbox="315.500,787.572,327.500,802.380">
<text font="Ryumin-Light" bbox="315.500,787.572,327.500,802.380" size="14.808">報</text>
<text>
</text>
</textline>
</textbox>
<textbox id="3" bbox="392.500,787.804,457.500,800.144">
<textline bbox="392.500,787.804,457.500,800.144">
<text font="Ryumin-Light" bbox="392.500,787.804,402.500,800.144" size="12.340">第</text>
<text> </text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="405.000,789.064,415.000,798.234" size="9.170"></text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="415.000,789.064,425.000,798.234" size="9.170">Ÿ</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="425.000,789.064,435.000,798.234" size="9.170">˜</text>
<text font="GMALPM+DFHSMincho-W3G014" bbox="435.000,789.064,445.000,798.234" size="9.170">ž</text>
<text> </text>
<text font="Ryumin-Light" bbox="447.500,787.804,457.500,800.144" size="12.340">号</text>
<text>
</text>
</textline>
</textbox>
<textbox id="4" bbox="527.500,789.064,537.500,798.234">
<textline bbox="527.500,789.064,537.500,798.234">
<text font="GMALPM+DFHSMincho-W3G014" bbox="527.500,789.064,537.500,798.234" size="9.170"></text>
<text>
</text>
</textline>
</textbox>
<textbox id="5" bbox="420.474,420.036,548.988,781.908" wmode="vertical">
<textline bbox="540.988,715.796,548.988,781.492">
<text font="GothicBBB-Medium" bbox="540.988,771.796,548.988,781.492" size="9.696">政</text>
@ -3136,16 +3136,16 @@
<layout>
<textgroup bbox="45.212,52.036,549.776,802.380">
<textgroup bbox="87.500,787.572,537.500,802.380">
<textgroup bbox="392.500,787.804,537.500,800.144">
<textbox id="0" bbox="392.500,787.804,457.500,800.144" />
<textbox id="1" bbox="527.500,789.064,537.500,798.234" />
</textgroup>
<textgroup bbox="87.500,787.572,327.500,802.380">
<textbox id="0" bbox="87.500,787.804,242.500,800.144" />
<textgroup bbox="267.500,787.572,327.500,802.380">
<textbox id="2" bbox="267.500,787.572,279.500,802.380" />
<textbox id="3" bbox="315.500,787.572,327.500,802.380" />
<textbox id="1" bbox="267.500,787.572,279.500,802.380" />
<textbox id="2" bbox="315.500,787.572,327.500,802.380" />
</textgroup>
<textbox id="4" bbox="87.500,787.804,242.500,800.144" />
</textgroup>
<textgroup bbox="392.500,787.804,537.500,800.144">
<textbox id="3" bbox="392.500,787.804,457.500,800.144" />
<textbox id="4" bbox="527.500,789.064,537.500,798.234" />
</textgroup>
</textgroup>
<textgroup bbox="45.212,52.036,549.776,781.960">

File diff suppressed because it is too large Load Diff

View File

@ -28,46 +28,37 @@ good results.
1 Background
Every day, a large number of news articles are cre-
ated and reported, many of which are unique.
But
ated and reported, many of which are unique. But
certain types of events, such as hurricanes or mur-
ders, are reported again and again throughout a year.
The goal of Information Extraction, or IE, is to re-
trieve a certain type of news event from past articles
and present the events as a table whose columns are
filled with a name of a person or company, accord-
However, existing IE
ing to its role in the event.
techniques require a lot of human labor.
First, you
ing to its role in the event. However, existing IE
techniques require a lot of human labor. First, you
have to specify the type of information you want and
collect articles that include this information.
Then,
collect articles that include this information. Then,
you have to analyze the articles and manually craft
a set of patterns to capture these events. Most exist-
ing IE research focuses on reducing this burden by
helping people create such patterns.
But each time
helping people create such patterns. But each time
you want to extract a different kind of information,
you need to repeat the whole process: specify arti-
cles and adjust its patterns, either manually or semi-
automatically.
There is a bit of a dangerous pitfall
automatically. There is a bit of a dangerous pitfall
here. First, it is hard to estimate how good the sys-
tem can be after months of work. Furthermore, you
might not know if the task is even doable in the first
place.
Knowing what kind of information is easily
place. Knowing what kind of information is easily
obtained in advance would help reduce this risk.
An IE task can be defined as finding a relation
among several entities involved in a certain type of
event.
For example, in the MUC-6 management
event. For example, in the MUC-6 management
succession scenario, one seeks a relation between
COMPANY, PERSON and POST involved with hir-
ing/firing events.
For each row of an extracted ta-
ing/firing events. For each row of an extracted ta-
ble, you can always read it as “COMPANY hired
(or fired) PERSON for POST.” The relation between
these entities is retained throughout the table. There

View File

@ -871,7 +871,7 @@
<text>
</text>
</textline>
<textline bbox="72.000,336.427,276.805,349.577">
<textline bbox="72.000,336.427,298.882,349.577">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="72.000,336.427,76.845,349.577" size="13.150">a</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="76.845,336.427,79.879,349.577" size="13.150">t</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="79.879,336.427,84.725,349.577" size="13.150">e</text>
@ -916,10 +916,7 @@
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="263.775,336.427,269.232,349.577" size="13.150">u</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="269.232,336.427,274.077,349.577" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="274.077,336.427,276.805,349.577" size="13.150">.</text>
<text>
</text>
</textline>
<textline bbox="283.113,336.427,298.882,349.577">
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="283.113,336.427,290.392,349.577" size="13.150">B</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="290.392,336.427,295.849,349.577" size="13.150">u</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="295.849,336.427,298.882,349.577" size="13.150">t</text>
@ -1260,31 +1257,7 @@
<text>
</text>
</textline>
<textline bbox="203.557,241.580,298.882,254.730">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="203.557,241.580,211.436,254.730" size="13.150">H</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="211.436,241.580,216.893,254.730" size="13.150">o</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="216.631,241.580,224.510,254.730" size="13.150">w</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="224.510,241.580,229.355,254.730" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="229.083,241.580,234.539,254.730" size="13.150">v</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="234.375,241.580,239.221,254.730" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="239.221,241.580,242.855,254.730" size="13.150">r</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="242.418,241.580,245.147,254.730" size="13.150">,</text>
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="249.861,241.580,254.706,254.730" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="254.554,241.580,260.010,254.730" size="13.150">x</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="260.010,241.580,263.044,254.730" size="13.150">i</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="263.044,241.580,267.289,254.730" size="13.150">s</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="267.289,241.580,270.323,254.730" size="13.150">t</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="270.323,241.580,273.357,254.730" size="13.150">i</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="273.357,241.580,278.813,254.730" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="278.813,241.580,284.270,254.730" size="13.150">g</text>
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="288.580,241.580,292.215,254.730" size="13.150">I</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="292.215,241.580,298.882,254.730" size="13.150">E</text>
<text>
</text>
</textline>
<textline bbox="72.000,241.580,195.405,254.730">
<textline bbox="72.000,241.580,298.882,254.730">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="72.000,241.580,75.034,254.730" size="13.150">i</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="75.034,241.580,80.490,254.730" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="80.490,241.580,85.947,254.730" size="13.150">g</text>
@ -1314,10 +1287,31 @@
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="184.186,241.580,189.643,254.730" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="189.643,241.580,192.677,254.730" size="13.150">t</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="192.677,241.580,195.405,254.730" size="13.150">.</text>
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="203.557,241.580,211.436,254.730" size="13.150">H</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="211.436,241.580,216.893,254.730" size="13.150">o</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="216.631,241.580,224.510,254.730" size="13.150">w</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="224.510,241.580,229.355,254.730" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="229.083,241.580,234.539,254.730" size="13.150">v</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="234.375,241.580,239.221,254.730" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="239.221,241.580,242.855,254.730" size="13.150">r</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="242.418,241.580,245.147,254.730" size="13.150">,</text>
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="249.861,241.580,254.706,254.730" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="254.554,241.580,260.010,254.730" size="13.150">x</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="260.010,241.580,263.044,254.730" size="13.150">i</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="263.044,241.580,267.289,254.730" size="13.150">s</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="267.289,241.580,270.323,254.730" size="13.150">t</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="270.323,241.580,273.357,254.730" size="13.150">i</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="273.357,241.580,278.813,254.730" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="278.813,241.580,284.270,254.730" size="13.150">g</text>
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="288.580,241.580,292.215,254.730" size="13.150">I</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="292.215,241.580,298.882,254.730" size="13.150">E</text>
<text>
</text>
</textline>
<textline bbox="72.000,228.026,250.352,241.177">
<textline bbox="72.000,228.026,298.882,241.177">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="72.000,228.026,75.034,241.177" size="13.150">t</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="75.034,228.026,79.879,241.177" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="79.879,228.026,84.725,241.177" size="13.150">c</text>
@ -1358,10 +1352,7 @@
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="239.123,228.026,244.579,241.177" size="13.150">o</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="244.579,228.026,248.213,241.177" size="13.150">r</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="247.624,228.026,250.352,241.177" size="13.150">.</text>
<text>
</text>
</textline>
<textline bbox="256.060,228.026,298.882,241.177">
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="256.060,228.026,262.127,241.177" size="13.150">F</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="262.127,228.026,265.161,241.177" size="13.150">i</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="265.161,228.026,268.795,241.177" size="13.150">r</text>
@ -1431,7 +1422,7 @@
<text>
</text>
</textline>
<textline bbox="72.000,200.929,268.249,214.079">
<textline bbox="72.000,200.929,298.882,214.079">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="72.000,200.929,76.845,214.079" size="13.150">c</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="76.845,200.929,82.302,214.079" size="13.150">o</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="82.302,200.929,85.336,214.079" size="13.150">l</text>
@ -1479,10 +1470,7 @@
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="254.608,200.929,260.065,214.079" size="13.150">o</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="260.065,200.929,265.521,214.079" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="265.521,200.929,268.249,214.079" size="13.150">.</text>
<text>
</text>
</textline>
<textline bbox="273.728,200.929,298.882,214.079">
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="273.728,200.929,280.396,214.079" size="13.150">T</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="280.396,200.929,285.852,214.079" size="13.150">h</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="285.852,200.929,290.698,214.079" size="13.150">e</text>
@ -1658,7 +1646,7 @@
<text>
</text>
</textline>
<textline bbox="72.000,146.734,231.091,159.884">
<textline bbox="72.000,146.734,298.882,159.884">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="72.000,146.734,77.457,159.884" size="13.150">h</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="77.457,146.734,82.302,159.884" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="82.302,146.734,85.336,159.884" size="13.150">l</text>
@ -1695,10 +1683,7 @@
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="218.661,146.734,224.117,159.884" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="224.117,146.734,228.362,159.884" size="13.150">s</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="228.362,146.734,231.091,159.884" size="13.150">.</text>
<text>
</text>
</textline>
<textline bbox="236.743,146.734,298.882,159.884">
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="236.743,146.734,244.022,159.884" size="13.150">B</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="244.022,146.734,249.479,159.884" size="13.150">u</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="249.479,146.734,252.513,159.884" size="13.150">t</text>
@ -1886,7 +1871,7 @@
<text>
</text>
</textline>
<textline bbox="313.198,581.125,374.638,594.275">
<textline bbox="313.198,581.125,540.091,594.275">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="313.198,581.125,318.043,594.275" size="13.150">a</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="318.043,581.125,323.499,594.275" size="13.150">u</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="323.499,581.125,326.533,594.275" size="13.150">t</text>
@ -1901,10 +1886,7 @@
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="364.118,581.125,367.152,594.275" size="13.150">l</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="367.152,581.125,372.608,594.275" size="13.150">y</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="371.910,581.125,374.638,594.275" size="13.150">.</text>
<text>
</text>
</textline>
<textline bbox="380.749,581.125,540.091,594.275">
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="380.749,581.125,387.417,594.275" size="13.150">T</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="387.417,581.125,392.874,594.275" size="13.150">h</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="392.874,581.125,397.719,594.275" size="13.150">e</text>
@ -2112,17 +2094,14 @@
<text>
</text>
</textline>
<textline bbox="313.198,526.930,338.952,540.080">
<textline bbox="313.198,526.930,540.080,540.080">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="313.198,526.930,318.654,540.080" size="13.150">p</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="318.654,526.930,321.688,540.080" size="13.150">l</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="321.688,526.930,326.533,540.080" size="13.150">a</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="326.533,526.930,331.379,540.080" size="13.150">c</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="331.379,526.930,336.224,540.080" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="336.224,526.930,338.952,540.080" size="13.150">.</text>
<text>
</text>
</textline>
<textline bbox="344.682,526.930,540.080,540.080">
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="344.682,526.930,352.561,540.080" size="13.150">K</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="352.561,526.930,358.017,540.080" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="358.017,526.930,363.474,540.080" size="13.150">o</text>
@ -2325,17 +2304,14 @@
<text>
</text>
</textline>
<textline bbox="313.198,472.325,339.138,485.475">
<textline bbox="313.198,472.325,540.080,485.475">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="313.198,472.325,318.043,485.475" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="317.781,472.325,323.238,485.475" size="13.150">v</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="323.074,472.325,327.919,485.475" size="13.150">e</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="327.919,472.325,333.376,485.475" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="333.376,472.325,336.410,485.475" size="13.150">t</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="336.410,472.325,339.138,485.475" size="13.150">.</text>
<text>
</text>
</textline>
<textline bbox="349.003,472.325,540.080,485.475">
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="349.003,472.325,355.071,485.475" size="13.150">F</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="354.918,472.325,360.375,485.475" size="13.150">o</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="360.375,472.325,364.009,485.475" size="13.150">r</text>
@ -2475,7 +2451,7 @@
<text>
</text>
</textline>
<textline bbox="313.198,431.673,387.854,444.824">
<textline bbox="313.198,431.673,540.080,444.824">
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="313.198,431.673,316.231,444.824" size="13.150">i</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="316.231,431.673,321.688,444.824" size="13.150">n</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="321.688,431.673,327.144,444.824" size="13.150">g</text>
@ -2493,10 +2469,7 @@
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="377.846,431.673,380.880,444.824" size="13.150">t</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="380.880,431.673,385.125,444.824" size="13.150">s</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="385.125,431.673,387.854,444.824" size="13.150">.</text>
<text>
</text>
</textline>
<textline bbox="394.576,431.673,540.080,444.824">
<text> </text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="394.576,431.673,400.644,444.824" size="13.150">F</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="400.491,431.673,405.948,444.824" size="13.150">o</text>
<text font="QTLIUY+NimbusRomNo9L-Regu" bbox="405.948,431.673,409.582,444.824" size="13.150">r</text>

View File

@ -3,92 +3,13 @@
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:800px; height:600px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<span style="position:absolute; border: cyan 1px solid; left:62px; top:126px; width:672px; height:157px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:62px; top:126px; width:672px; height:85px;"></span>
<span style="position:absolute; left:62px; top:126px; font-size:85px;">コ</span>
<span style="position:absolute; left:110px; top:126px; font-size:85px;">ン</span>
<span style="position:absolute; left:158px; top:126px; font-size:85px;">パ</span>
<span style="position:absolute; left:206px; top:126px; font-size:85px;">ラ</span>
<span style="position:absolute; left:254px; top:126px; font-size:85px;">ブ</span>
<span style="position:absolute; left:302px; top:126px; font-size:85px;">ル</span>
<span style="position:absolute; left:350px; top:126px; font-size:85px;">な</span>
<span style="position:absolute; left:398px; top:126px; font-size:85px;">新</span>
<span style="position:absolute; left:446px; top:126px; font-size:85px;">聞</span>
<span style="position:absolute; left:494px; top:126px; font-size:85px;">記</span>
<span style="position:absolute; left:542px; top:126px; font-size:85px;">事</span>
<span style="position:absolute; left:590px; top:126px; font-size:85px;">か</span>
<span style="position:absolute; left:638px; top:126px; font-size:85px;">ら</span>
<span style="position:absolute; left:686px; top:126px; font-size:85px;">の</span>
<span style="position:absolute; border: magenta 1px solid; left:62px; top:198px; width:336px; height:85px;"></span>
<span style="position:absolute; left:62px; top:198px; font-size:85px;">固</span>
<span style="position:absolute; left:110px; top:198px; font-size:85px;">有</span>
<span style="position:absolute; left:158px; top:198px; font-size:85px;">表</span>
<span style="position:absolute; left:206px; top:198px; font-size:85px;">現</span>
<span style="position:absolute; left:254px; top:198px; font-size:85px;">の</span>
<span style="position:absolute; left:302px; top:198px; font-size:85px;">発</span>
<span style="position:absolute; left:350px; top:198px; font-size:85px;">見</span>
<span style="position:absolute; border: cyan 1px solid; left:263px; top:374px; width:468px; height:212px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:576px; top:374px; width:155px; height:64px;"></span>
<span style="position:absolute; left:576px; top:374px; font-size:64px;">新</span>
<span style="position:absolute; left:612px; top:374px; font-size:64px;">山</span>
<span style="position:absolute; left:648px; top:374px; font-size:64px;"> </span>
<span style="position:absolute; left:660px; top:374px; font-size:64px;">祐</span>
<span style="position:absolute; left:696px; top:374px; font-size:64px;">介</span>
<span style="position:absolute; border: magenta 1px solid; left:612px; top:430px; width:119px; height:64px;"></span>
<span style="position:absolute; left:612px; top:430px; font-size:64px;">関</span>
<span style="position:absolute; left:648px; top:430px; font-size:64px;">根</span>
<span style="position:absolute; left:684px; top:430px; font-size:64px;"> </span>
<span style="position:absolute; left:696px; top:430px; font-size:64px;">聡</span>
<span style="position:absolute; border: magenta 1px solid; left:263px; top:493px; width:468px; height:50px;"></span>
<span style="position:absolute; left:263px; top:493px; font-size:50px;">C</span>
<span style="position:absolute; left:285px; top:493px; font-size:50px;">o</span>
<span style="position:absolute; left:304px; top:493px; font-size:50px;">m</span>
<span style="position:absolute; left:332px; top:493px; font-size:50px;">p</span>
<span style="position:absolute; left:352px; top:493px; font-size:50px;">u</span>
<span style="position:absolute; left:371px; top:493px; font-size:50px;">t</span>
<span style="position:absolute; left:383px; top:493px; font-size:50px;">e</span>
<span style="position:absolute; left:401px; top:493px; font-size:50px;">r</span>
<span style="position:absolute; left:415px; top:493px; font-size:50px;"> </span>
<span style="position:absolute; left:424px; top:493px; font-size:50px;">S</span>
<span style="position:absolute; left:444px; top:493px; font-size:50px;">c</span>
<span style="position:absolute; left:462px; top:493px; font-size:50px;">i</span>
<span style="position:absolute; left:469px; top:493px; font-size:50px;">e</span>
<span style="position:absolute; left:487px; top:493px; font-size:50px;">n</span>
<span style="position:absolute; left:506px; top:493px; font-size:50px;">c</span>
<span style="position:absolute; left:523px; top:493px; font-size:50px;">e</span>
<span style="position:absolute; left:541px; top:493px; font-size:50px;"> </span>
<span style="position:absolute; left:550px; top:493px; font-size:50px;">D</span>
<span style="position:absolute; left:573px; top:493px; font-size:50px;">e</span>
<span style="position:absolute; left:591px; top:493px; font-size:50px;">p</span>
<span style="position:absolute; left:611px; top:493px; font-size:50px;">a</span>
<span style="position:absolute; left:628px; top:493px; font-size:50px;">r</span>
<span style="position:absolute; left:642px; top:493px; font-size:50px;">t</span>
<span style="position:absolute; left:654px; top:493px; font-size:50px;">m</span>
<span style="position:absolute; left:683px; top:493px; font-size:50px;">e</span>
<span style="position:absolute; left:701px; top:493px; font-size:50px;">n</span>
<span style="position:absolute; left:719px; top:493px; font-size:50px;">t</span>
<span style="position:absolute; border: magenta 1px solid; left:424px; top:537px; width:307px; height:50px;"></span>
<span style="position:absolute; left:424px; top:537px; font-size:50px;">N</span>
<span style="position:absolute; left:447px; top:537px; font-size:50px;">e</span>
<span style="position:absolute; left:464px; top:537px; font-size:50px;">w</span>
<span style="position:absolute; left:488px; top:537px; font-size:50px;"> </span>
<span style="position:absolute; left:498px; top:537px; font-size:50px;">Y</span>
<span style="position:absolute; left:519px; top:537px; font-size:50px;">o</span>
<span style="position:absolute; left:538px; top:537px; font-size:50px;">r</span>
<span style="position:absolute; left:551px; top:537px; font-size:50px;">k</span>
<span style="position:absolute; left:570px; top:537px; font-size:50px;"> </span>
<span style="position:absolute; left:579px; top:537px; font-size:50px;">U</span>
<span style="position:absolute; left:602px; top:537px; font-size:50px;">n</span>
<span style="position:absolute; left:621px; top:537px; font-size:50px;">i</span>
<span style="position:absolute; left:629px; top:537px; font-size:50px;">v</span>
<span style="position:absolute; left:646px; top:537px; font-size:50px;">e</span>
<span style="position:absolute; left:664px; top:537px; font-size:50px;">r</span>
<span style="position:absolute; left:678px; top:537px; font-size:50px;">s</span>
<span style="position:absolute; left:694px; top:537px; font-size:50px;">i</span>
<span style="position:absolute; left:702px; top:537px; font-size:50px;">t</span>
<span style="position:absolute; left:714px; top:537px; font-size:50px;">y</span>
<span style="position:absolute; border: black 1px solid; left:0px; top:50px; width:800px; height:600px;"></span>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:62px; top:126px; width:672px; height:157px;"><span style="font-family: DAFPJF+HiraKakuPro-W6; font-size:60px">コンパラブルな新聞記事からの
<br>固有表現の発見
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:263px; top:374px; width:468px; height:212px;"><span style="font-family: DAFPJF+HiraKakuPro-W6; font-size:45px">新山 祐介
<br>関根 聡
<br></span><span style="font-family: DAFPJF+HiraKakuPro-W6; font-size:35px">Computer Science Department
<br>New York University
<br></span></div><span style="position:absolute; border: black 1px solid; left:0px; top:50px; width:800px; height:600px;"></span>
<span style="position:absolute; border: black 1px solid; left:50px; top:308px; width:510px; height:0px;"></span>
<span style="position:absolute; border: yellow 1px solid; left:25px; top:587px; width:41px; height:40px;"></span>
<div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
</body></html>

View File

@ -3,65 +3,13 @@
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<span style="position:absolute; border: cyan 1px solid; left:100px; top:119px; width:61px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:100px; top:119px; width:61px; height:27px;"></span>
<span style="position:absolute; left:100px; top:119px; font-size:27px;">H</span>
<span style="position:absolute; left:117px; top:119px; font-size:27px;">e</span>
<span style="position:absolute; left:130px; top:119px; font-size:27px;">l</span>
<span style="position:absolute; left:136px; top:119px; font-size:27px;">l</span>
<span style="position:absolute; left:141px; top:119px; font-size:27px;">o</span>
<span style="position:absolute; left:154px; top:119px; font-size:27px;"> </span>
<span style="position:absolute; border: cyan 1px solid; left:261px; top:119px; width:62px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:261px; top:119px; width:62px; height:27px;"></span>
<span style="position:absolute; left:261px; top:119px; font-size:27px;">W</span>
<span style="position:absolute; left:283px; top:119px; font-size:27px;">o</span>
<span style="position:absolute; left:297px; top:119px; font-size:27px;">r</span>
<span style="position:absolute; left:305px; top:119px; font-size:27px;">l</span>
<span style="position:absolute; left:310px; top:119px; font-size:27px;">d</span>
<span style="position:absolute; border: cyan 1px solid; left:100px; top:219px; width:61px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:100px; top:219px; width:61px; height:27px;"></span>
<span style="position:absolute; left:100px; top:219px; font-size:27px;">H</span>
<span style="position:absolute; left:117px; top:219px; font-size:27px;">e</span>
<span style="position:absolute; left:130px; top:219px; font-size:27px;">l</span>
<span style="position:absolute; left:136px; top:219px; font-size:27px;">l</span>
<span style="position:absolute; left:141px; top:219px; font-size:27px;">o</span>
<span style="position:absolute; left:154px; top:219px; font-size:27px;"> </span>
<span style="position:absolute; border: cyan 1px solid; left:261px; top:219px; width:62px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:261px; top:219px; width:62px; height:27px;"></span>
<span style="position:absolute; left:261px; top:219px; font-size:27px;">W</span>
<span style="position:absolute; left:284px; top:219px; font-size:27px;">o</span>
<span style="position:absolute; left:297px; top:219px; font-size:27px;">r</span>
<span style="position:absolute; left:305px; top:219px; font-size:27px;">l</span>
<span style="position:absolute; left:310px; top:219px; font-size:27px;">d</span>
<span style="position:absolute; border: cyan 1px solid; left:100px; top:319px; width:111px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:100px; top:319px; width:111px; height:27px;"></span>
<span style="position:absolute; left:100px; top:319px; font-size:27px;">H</span>
<span style="position:absolute; left:127px; top:319px; font-size:27px;">e</span>
<span style="position:absolute; left:150px; top:319px; font-size:27px;">l</span>
<span style="position:absolute; left:166px; top:319px; font-size:27px;">l</span>
<span style="position:absolute; left:181px; top:319px; font-size:27px;">o</span>
<span style="position:absolute; left:204px; top:319px; font-size:27px;"> </span>
<span style="position:absolute; border: cyan 1px solid; left:321px; top:319px; width:102px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:321px; top:319px; width:102px; height:27px;"></span>
<span style="position:absolute; left:321px; top:319px; font-size:27px;">W</span>
<span style="position:absolute; left:354px; top:319px; font-size:27px;">o</span>
<span style="position:absolute; left:377px; top:319px; font-size:27px;">r</span>
<span style="position:absolute; left:395px; top:319px; font-size:27px;">l</span>
<span style="position:absolute; left:410px; top:319px; font-size:27px;">d</span>
<span style="position:absolute; border: cyan 1px solid; left:100px; top:419px; width:111px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:100px; top:419px; width:111px; height:27px;"></span>
<span style="position:absolute; left:100px; top:419px; font-size:27px;">H</span>
<span style="position:absolute; left:127px; top:419px; font-size:27px;">e</span>
<span style="position:absolute; left:150px; top:419px; font-size:27px;">l</span>
<span style="position:absolute; left:165px; top:419px; font-size:27px;">l</span>
<span style="position:absolute; left:181px; top:419px; font-size:27px;">o</span>
<span style="position:absolute; left:204px; top:419px; font-size:27px;"> </span>
<span style="position:absolute; border: cyan 1px solid; left:321px; top:419px; width:102px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:321px; top:419px; width:102px; height:27px;"></span>
<span style="position:absolute; left:321px; top:419px; font-size:27px;">W</span>
<span style="position:absolute; left:353px; top:419px; font-size:27px;">o</span>
<span style="position:absolute; left:377px; top:419px; font-size:27px;">r</span>
<span style="position:absolute; left:395px; top:419px; font-size:27px;">l</span>
<span style="position:absolute; left:410px; top:419px; font-size:27px;">d</span>
<div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:100px; top:119px; width:61px; height:27px;"><span style="font-family: Helvetica; font-size:19px">Hello
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:261px; top:119px; width:62px; height:27px;"><span style="font-family: Helvetica; font-size:19px">World
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:100px; top:219px; width:61px; height:27px;"><span style="font-family: Helvetica; font-size:19px">Hello
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:261px; top:219px; width:62px; height:27px;"><span style="font-family: Helvetica; font-size:19px">World
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:100px; top:319px; width:111px; height:27px;"><span style="font-family: Helvetica; font-size:19px">H e l l o
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:321px; top:319px; width:102px; height:27px;"><span style="font-family: Helvetica; font-size:19px">W o r l d
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:100px; top:419px; width:111px; height:27px;"><span style="font-family: Helvetica; font-size:19px">H e l l o
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:321px; top:419px; width:102px; height:27px;"><span style="font-family: Helvetica; font-size:19px">W o r l d
<br></span></div><div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
</body></html>

View File

@ -3,42 +3,9 @@
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<span style="position:absolute; border: cyan 1px solid; left:0px; top:72px; width:218px; height:79px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:0px; top:72px; width:218px; height:79px;"></span>
<span style="position:absolute; left:0px; top:96px; font-size:55px;">H</span>
<span style="position:absolute; left:34px; top:96px; font-size:55px;">e</span>
<span style="position:absolute; left:61px; top:96px; font-size:55px;">l</span>
<span style="position:absolute; left:72px; top:96px; font-size:55px;">l</span>
<span style="position:absolute; left:82px; top:96px; font-size:55px;">o</span>
<span style="position:absolute; left:109px; top:72px; font-size:55px;">H</span>
<span style="position:absolute; left:144px; top:72px; font-size:55px;">e</span>
<span style="position:absolute; left:170px; top:72px; font-size:55px;">l</span>
<span style="position:absolute; left:181px; top:72px; font-size:55px;">l</span>
<span style="position:absolute; left:192px; top:72px; font-size:55px;">o</span>
<span style="position:absolute; border: cyan 1px solid; left:194px; top:136px; width:48px; height:490px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:194px; top:136px; width:48px; height:490px;"></span>
<span style="position:absolute; left:194px; top:136px; font-size:48px;">あ</span>
<span style="position:absolute; left:194px; top:184px; font-size:48px;">い</span>
<span style="position:absolute; left:194px; top:232px; font-size:48px;">う</span>
<span style="position:absolute; left:194px; top:280px; font-size:48px;">え</span>
<span style="position:absolute; left:194px; top:328px; font-size:48px;">お</span>
<span style="position:absolute; left:194px; top:352px; font-size:48px;">あ</span>
<span style="position:absolute; left:194px; top:400px; font-size:48px;">い</span>
<span style="position:absolute; left:194px; top:448px; font-size:48px;">う</span>
<span style="position:absolute; left:194px; top:496px; font-size:48px;">え</span>
<span style="position:absolute; left:194px; top:544px; font-size:48px;">お</span>
<span style="position:absolute; left:218px; top:599px; font-size:27px;">W</span>
<span style="position:absolute; border: cyan 1px solid; left:241px; top:575px; width:102px; height:51px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:281px; top:575px; width:62px; height:27px;"></span>
<span style="position:absolute; left:281px; top:575px; font-size:27px;">W</span>
<span style="position:absolute; left:304px; top:575px; font-size:27px;">o</span>
<span style="position:absolute; left:317px; top:575px; font-size:27px;">r</span>
<span style="position:absolute; left:325px; top:575px; font-size:27px;">l</span>
<span style="position:absolute; left:330px; top:575px; font-size:27px;">d</span>
<span style="position:absolute; border: magenta 1px solid; left:241px; top:599px; width:40px; height:27px;"></span>
<span style="position:absolute; left:241px; top:599px; font-size:27px;">o</span>
<span style="position:absolute; left:254px; top:599px; font-size:27px;">r</span>
<span style="position:absolute; left:262px; top:599px; font-size:27px;">l</span>
<span style="position:absolute; left:268px; top:599px; font-size:27px;">d</span>
<div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:0px; top:72px; width:218px; height:79px;"><span style="font-family: Helvetica; font-size:38px">HelloHello
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:tb-rl; left:194px; top:136px; width:48px; height:490px;"><span style="font-family: unknown; font-size:33px">あいうえおあいうえお </span><span style="font-family: Helvetica; font-size:19px">W
<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:241px; top:575px; width:102px; height:51px;"><span style="font-family: Helvetica; font-size:19px">World
<br>orld
<br></span></div><div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
</body></html>

View File

@ -1,4 +1,4 @@
#!/usr/bin/python2 -O
#!/usr/bin/python -O
#
# pdf2html.cgi - Gateway script for converting PDF into HTML.
#
@ -42,7 +42,7 @@ def url(base, **kw):
## convert
##
class FileSizeExceeded(ValueError): pass
def convert(outfp, infp, path, codec='utf-8',
def convert(infp, outfp, path, codec='utf-8',
maxpages=0, maxfilesize=0, pagenos=None,
html=True):
# save the input file.
@ -76,22 +76,22 @@ def convert(outfp, infp, path, codec='utf-8',
class WebApp(object):
TITLE = 'pdf2html demo'
APPPATH = '/' # absolute URL path to this application.
MAXFILESIZE = 5000000 # set to zero if unlimited.
MAXFILESIZE = 10000000 # set to zero if unlimited.
MAXPAGES = 10 # set to zero if unlimited.
def __init__(self, infp=sys.stdin, outfp=sys.stdout, codec='utf-8'):
def __init__(self, infp=sys.stdin, outfp=sys.stdout, environ=os.environ,
codec='utf-8', apppath='/'):
self.infp = infp
self.outfp = outfp
self.codec = codec
self.remote_addr = os.environ.get('REMOTE_ADDR')
self.path_info = os.environ.get('PATH_INFO')
self.method = os.environ.get('REQUEST_METHOD', 'GET').upper()
self.server = os.environ.get('SERVER_SOFTWARE', '')
self.logpath = os.environ.get('LOG_PATH', './var/log')
self.tmpdir = os.environ.get('TEMP', './var/')
self.apppath = apppath
self.remote_addr = environ.get('REMOTE_ADDR')
self.path_info = environ.get('PATH_INFO')
self.method = environ.get('REQUEST_METHOD', 'GET').upper()
self.server = environ.get('SERVER_SOFTWARE', '')
self.tmpdir = environ.get('TEMP', './var/')
self.content_type = 'text/html; charset=%s' % codec
self.cur_time = time.time()
self.form = cgi.FieldStorage(infp)
self.logger = logging.getLogger()
return
def put(self, *args):
@ -130,7 +130,7 @@ class WebApp(object):
self.put(
'<html><head><title>%s</title></head><body>\n' % q(self.TITLE),
'<h1>%s</h1><hr>\n' % q(self.TITLE),
'<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPPATH),
'<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.apppath),
'<p>Upload PDF File: <input name="f" type="file" value="">\n',
'&nbsp; Page numbers (comma-separated):\n',
'<input name="p" type="text" size="10" value="">\n',
@ -145,49 +145,54 @@ class WebApp(object):
)
return
def run(self, argv):
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s',
filename=self.logpath, filemode='a')
if self.path_info != self.APPPATH:
def setup(self):
if not os.path.isdir(self.tmpdir):
self.logger.error('no tmpdir')
status = 304
elif self.path_info != self.apppath:
status = 404
else:
status = 200
self._status = status
return status
def run(self):
form = cgi.FieldStorage(self.infp)
if self._status != 200:
self.http_404()
return
if not os.path.isdir(self.tmpdir):
logging.error('no tmpdir')
self.bummer('error')
return
if (self.method != 'POST' or
'c' not in self.form or
'f' not in self.form):
'c' not in form or
'f' not in form):
self.coverpage()
return
item = self.form['f']
item = form['f']
if not (item.file and item.filename):
self.coverpage()
return
cmd = self.form.getvalue('c')
cmd = form.getvalue('c')
html = (cmd == 'Convert to HTML')
pagenos = []
if 'p' in self.form:
for m in re.finditer(r'\d+', self.form.getvalue('p')):
if 'p' in form:
for m in re.finditer(r'\d+', form.getvalue('p')):
try:
pagenos.append(int(m.group(0)))
except ValueError:
pass
logging.info('received: host=%s, name=%r, pagenos=%r' %
self.logger.info('received: host=%s, name=%r, pagenos=%r' %
(self.remote_addr, item.filename, pagenos))
h = abs(hash((random.random(), self.remote_addr, item.filename)))
tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (self.cur_time, h))
tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (time.time(), h))
try:
if not html:
self.content_type = 'text/plain; charset=%s' % self.codec
self.http_200()
try:
convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec,
convert(item.file, sys.stdout, tmppath, pagenos=pagenos, codec=self.codec,
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
except Exception, e:
self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
logging.error('convert: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
self.logger.error('convert: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
finally:
try:
os.remove(tmppath)
@ -197,4 +202,7 @@ class WebApp(object):
# main
if __name__ == '__main__': sys.exit(WebApp().run(sys.argv))
if __name__ == '__main__':
app = WebApp()
app.setup()
sys.exit(app.run())