tesseract-004

Tests that cx:tesseract can produce ALTO XML output.

Test is expected to pass.

The pipeline

<p:declare-step name="main" version="3.0"
                xmlns:p="http://www.w3.org/ns/xproc"
                xmlns:cx="http://xmlcalabash.com/ns/extensions"
                xmlns:t="http://xproc.org/ns/testsuite/3.0">
   <!-- Test pipeline: render the example PDF to images, then run OCR on
        the images and request ALTO output. -->

   <!-- Step libraries imported for the cx: extension steps used below. -->
   <p:import href="https://xmlcalabash.com/ext/library/pdf-steps.xpl"/>
   <p:import href="https://xmlcalabash.com/ext/library/tesseract.xpl"/>

   <!-- No explicit connection: the output is bound by XProc's default
        rules to the primary output of the last step. -->
   <p:output port="result"/>

   <!-- Load the PDF (path is relative to this pipeline) and pass it to
        cx:pdf-to-images with dpi set to 300. -->
   <cx:pdf-to-images dpi="300">
      <p:with-input port="source">
         <p:document href="../documents/example.pdf"/>
      </p:with-input>
   </cx:pdf-to-images>

   <!-- No explicit input connection: the step reads the primary output
        of the preceding step.
        NOTE(review): debug-output points at /dev/null, which exists only
        on Unix-like systems; confirm this test is not expected to run
        on Windows. -->
   <cx:tesseract language="eng"
                 output-format="alto"
                 debug-output="/dev/null"/>
</p:declare-step>

Result

<!-- Recorded result for this test: a single Page element containing one
     PrintSpace. The PrintSpace holds three ComposedBlock text regions
     (CONTENT values read "PDF Text", "This is a sample PDF document."
     and "With an image.") plus one Illustration region between the
     second and third text regions.
     NOTE(review): the Schematron checks for this test only assert the
     presence of Page, PrintSpace and at least one String; the exact
     coordinates and WC values below are presumably illustrative and not
     compared; confirm against the test harness. -->
<Page xmlns:t="http://xproc.org/ns/testsuite/3.0"
      WIDTH="2480" HEIGHT="3507" PHYSICAL_IMG_NR="0" ID="page_0">
   <PrintSpace HPOS="0" VPOS="0" WIDTH="2480" HEIGHT="3507">
      <ComposedBlock ID="cblock_0" HPOS="200" VPOS="237"
                     WIDTH="386" HEIGHT="64">
         <TextBlock ID="block_0" HPOS="200" VPOS="237"
                    WIDTH="386" HEIGHT="64">
            <TextLine ID="line_0" HPOS="200" VPOS="237"
                      WIDTH="386" HEIGHT="64">
               <String ID="string_0" HPOS="200" VPOS="237"
                       WIDTH="173" HEIGHT="63" WC="0.96" CONTENT="PDF"/>
               <SP WIDTH="31" VPOS="237" HPOS="373"/>
               <String ID="string_1" HPOS="404" VPOS="237"
                       WIDTH="182" HEIGHT="64" WC="0.96"
                       CONTENT="Text"/>
            </TextLine>
         </TextBlock>
      </ComposedBlock>
      <ComposedBlock ID="cblock_1" HPOS="191" VPOS="387"
                     WIDTH="651" HEIGHT="44">
         <TextBlock ID="block_1" HPOS="191" VPOS="387"
                    WIDTH="651" HEIGHT="44">
            <TextLine ID="line_1" HPOS="191" VPOS="387"
                      WIDTH="651" HEIGHT="44">
               <String ID="string_2" HPOS="191" VPOS="387"
                       WIDTH="80" HEIGHT="34" WC="0.96"
                       CONTENT="This"/>
               <SP WIDTH="18" VPOS="387" HPOS="271"/>
               <String ID="string_3" HPOS="289" VPOS="389"
                       WIDTH="27" HEIGHT="32" WC="0.96" CONTENT="is"/>
               <SP WIDTH="31" VPOS="389" HPOS="316"/>
               <String ID="string_4" HPOS="347" VPOS="398"
                       WIDTH="5" HEIGHT="23" WC="0.96" CONTENT="a"/>
               <SP WIDTH="17" VPOS="398" HPOS="352"/>
               <String ID="string_5" HPOS="369" VPOS="387"
                       WIDTH="143" HEIGHT="44" WC="0.96"
                       CONTENT="sample"/>
               <SP WIDTH="19" VPOS="387" HPOS="512"/>
               <String ID="string_6" HPOS="531" VPOS="389"
                       WIDTH="82" HEIGHT="32" WC="0.96" CONTENT="PDF"/>
               <SP WIDTH="16" VPOS="389" HPOS="613"/>
               <String ID="string_7" HPOS="629" VPOS="387"
                       WIDTH="213" HEIGHT="34" WC="0.95"
                       CONTENT="document."/>
            </TextLine>
         </TextBlock>
      </ComposedBlock>
      <Illustration ID="cblock_2" HPOS="206" VPOS="578"
                    WIDTH="466" HEIGHT="478"/>
      <ComposedBlock ID="cblock_3" HPOS="190" VPOS="1212"
                     WIDTH="305" HEIGHT="45">
         <TextBlock ID="block_2" HPOS="190" VPOS="1212"
                    WIDTH="305" HEIGHT="45">
            <TextLine ID="line_2" HPOS="190" VPOS="1212"
                      WIDTH="305" HEIGHT="45">
               <String ID="string_8" HPOS="190"
                       VPOS="1212" WIDTH="93" HEIGHT="34" WC="0.95"
                       CONTENT="With"/>
               <SP WIDTH="18" VPOS="1212" HPOS="283"/>
               <String ID="string_9" HPOS="301"
                       VPOS="1223" WIDTH="44" HEIGHT="23" WC="0.95"
                       CONTENT="an"/>
               <SP WIDTH="19" VPOS="1223" HPOS="345"/>
               <String ID="string_10" HPOS="364"
                       VPOS="1214" WIDTH="131" HEIGHT="43" WC="0.95"
                       CONTENT="image."/>
            </TextLine>
         </TextBlock>
      </ComposedBlock>
   </PrintSpace>
</Page>

Schematron checks

<s:schema queryBinding="xslt2"
          xmlns:s="http://purl.oclc.org/dsdl/schematron"
          xmlns:t="http://xproc.org/ns/testsuite/3.0">
   <!-- Structural checks on the pipeline result. All element names are
        unprefixed, so they match elements in no namespace. -->

   <!-- The document element must be Page. -->
   <s:pattern>
      <s:rule context="/">
         <s:assert test="exists(Page)">Wrong document element</s:assert>
      </s:rule>
   </s:pattern>

   <!-- Page must contain a PrintSpace, and at least one recognised
        String must sit at the expected nesting depth beneath it. -->
   <s:pattern>
      <s:rule context="/Page">
         <s:assert test="exists(PrintSpace)">No PrintSpace</s:assert>
         <s:assert
            test="exists(PrintSpace/ComposedBlock/TextBlock/TextLine/String)">No string</s:assert>
      </s:rule>
   </s:pattern>
</s:schema>

Revision history

12 Jun 2026, Norm Tovey-Walsh
Created test.