tesseract-004
Tests that the cx:tesseract step can produce ALTO XML output.
Test is expected to pass.
The pipeline
<p:declare-step name="main" version="3.0"
                xmlns:p="http://www.w3.org/ns/xproc"
                xmlns:cx="http://xmlcalabash.com/ns/extensions"
                xmlns:t="http://xproc.org/ns/testsuite/3.0">
  <!-- Extension step libraries; presumably these declare the
       cx:pdf-to-images and cx:tesseract steps used below. -->
  <p:import href="https://xmlcalabash.com/ext/library/pdf-steps.xpl"/>
  <p:import href="https://xmlcalabash.com/ext/library/tesseract.xpl"/>

  <p:output port="result"/>

  <!-- Pass the sample PDF to cx:pdf-to-images with dpi="300". -->
  <cx:pdf-to-images dpi="300">
    <p:with-input port="source" href="../documents/example.pdf"/>
  </cx:pdf-to-images>

  <!-- No explicit input binding here: the step relies on XProc's default
       readable port, i.e. the output of the preceding step. The request is
       for English ("eng") recognition with the "alto" output format.
       NOTE(review): debug-output="/dev/null" presumably discards debug
       output and is a POSIX-only path; confirm against the step docs. -->
  <cx:tesseract output-format="alto" language="eng"
                debug-output="/dev/null"/>
</p:declare-step>
Result
<Page xmlns:t="http://xproc.org/ns/testsuite/3.0"
      WIDTH="2480" HEIGHT="3507" PHYSICAL_IMG_NR="0" ID="page_0">
  <PrintSpace HPOS="0" VPOS="0" WIDTH="2480" HEIGHT="3507">
    <ComposedBlock ID="cblock_0" HPOS="200" VPOS="237" WIDTH="386" HEIGHT="64">
      <TextBlock ID="block_0" HPOS="200" VPOS="237" WIDTH="386" HEIGHT="64">
        <TextLine ID="line_0" HPOS="200" VPOS="237" WIDTH="386" HEIGHT="64">
          <String ID="string_0" HPOS="200" VPOS="237" WIDTH="173" HEIGHT="63"
                  WC="0.96" CONTENT="PDF"/>
          <SP WIDTH="31" VPOS="237" HPOS="373"/>
          <String ID="string_1" HPOS="404" VPOS="237" WIDTH="182" HEIGHT="64"
                  WC="0.96" CONTENT="Text"/>
        </TextLine>
      </TextBlock>
    </ComposedBlock>
    <ComposedBlock ID="cblock_1" HPOS="191" VPOS="387" WIDTH="651" HEIGHT="44">
      <TextBlock ID="block_1" HPOS="191" VPOS="387" WIDTH="651" HEIGHT="44">
        <TextLine ID="line_1" HPOS="191" VPOS="387" WIDTH="651" HEIGHT="44">
          <String ID="string_2" HPOS="191" VPOS="387" WIDTH="80" HEIGHT="34"
                  WC="0.96" CONTENT="This"/>
          <SP WIDTH="18" VPOS="387" HPOS="271"/>
          <String ID="string_3" HPOS="289" VPOS="389" WIDTH="27" HEIGHT="32"
                  WC="0.96" CONTENT="is"/>
          <SP WIDTH="31" VPOS="389" HPOS="316"/>
          <String ID="string_4" HPOS="347" VPOS="398" WIDTH="5" HEIGHT="23"
                  WC="0.96" CONTENT="a"/>
          <SP WIDTH="17" VPOS="398" HPOS="352"/>
          <String ID="string_5" HPOS="369" VPOS="387" WIDTH="143" HEIGHT="44"
                  WC="0.96" CONTENT="sample"/>
          <SP WIDTH="19" VPOS="387" HPOS="512"/>
          <String ID="string_6" HPOS="531" VPOS="389" WIDTH="82" HEIGHT="32"
                  WC="0.96" CONTENT="PDF"/>
          <SP WIDTH="16" VPOS="389" HPOS="613"/>
          <String ID="string_7" HPOS="629" VPOS="387" WIDTH="213" HEIGHT="34"
                  WC="0.95" CONTENT="document."/>
        </TextLine>
      </TextBlock>
    </ComposedBlock>
    <Illustration ID="cblock_2" HPOS="206" VPOS="578" WIDTH="466" HEIGHT="478"/>
    <ComposedBlock ID="cblock_3" HPOS="190" VPOS="1212" WIDTH="305" HEIGHT="45">
      <TextBlock ID="block_2" HPOS="190" VPOS="1212" WIDTH="305" HEIGHT="45">
        <TextLine ID="line_2" HPOS="190" VPOS="1212" WIDTH="305" HEIGHT="45">
          <String ID="string_8" HPOS="190" VPOS="1212" WIDTH="93" HEIGHT="34"
                  WC="0.95" CONTENT="With"/>
          <SP WIDTH="18" VPOS="1212" HPOS="283"/>
          <String ID="string_9" HPOS="301" VPOS="1223" WIDTH="44" HEIGHT="23"
                  WC="0.95" CONTENT="an"/>
          <SP WIDTH="19" VPOS="1223" HPOS="345"/>
          <String ID="string_10" HPOS="364" VPOS="1214" WIDTH="131" HEIGHT="43"
                  WC="0.95" CONTENT="image."/>
        </TextLine>
      </TextBlock>
    </ComposedBlock>
  </PrintSpace>
</Page>
Schematron checks
<s:schema queryBinding="xslt2"
          xmlns:s="http://purl.oclc.org/dsdl/schematron"
          xmlns:t="http://xproc.org/ns/testsuite/3.0">
  <!-- Check 1: the document node must have a Page child, matched by its
       unprefixed name (this schema binds no prefix for it). -->
  <s:pattern>
    <s:rule context="/">
      <s:assert test="Page">Wrong document element</s:assert>
    </s:rule>
  </s:pattern>

  <!-- Check 2: Page must contain a PrintSpace, and at least one String must
       be reachable via PrintSpace/ComposedBlock/TextBlock/TextLine. -->
  <s:pattern>
    <s:rule context="/Page">
      <s:assert test="PrintSpace">No PrintSpace</s:assert>
      <s:assert test="PrintSpace/ComposedBlock/TextBlock/TextLine/String">No string</s:assert>
    </s:rule>
  </s:pattern>
</s:schema>
Revision history
- 12 Jun 2026, Norm Tovey-Walsh
- Created test.