HTML to XML transformation, xpath selectors for huge documents


#1

There are some issues with using XPath on huge documents: sometimes the document is valid, but the XPath expression does not return any values because of document formatting issues, nested comments, scripts, and slow DTD validation.
The following workaround can help make it work:

<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://web-harvest.sourceforge.net/schema/1.0/config" scriptlang="groovy">

	<!-- Read the file named by the "file" variable, run it through html-to-xml,
	     and keep the result in the "contentValue" variable. -->
	<var-def name="contentValue">
		<html-to-xml
			outputtype="pretty"
			omithtmlenvelope="true"
			omitcomments="true"
			prunetags="script,style">
			<file path="${file}" action="read" />
		</html-to-xml>
	</var-def>

<!-- correct usage:
     strip the XML declaration from the cleaned content, wrap what is left in a
     single <root> element, and evaluate the XPath against that wrapped text -->

	<xpath expression="//a">
		<var-def name="page">
			<!-- The declaration is removed by pattern instead of by one exact literal, so it is
			     still stripped when it is written with a different encoding or quoting.
			     A declaration left inside <root> would make the wrapped text ill-formed XML. -->
			<template>&lt;root&gt;${contentValue.toString().replaceFirst("&lt;\\?xml[^&gt;]*\\?&gt;","")}&lt;/root&gt;</template>
		</var-def>
	</xpath>

<!-- bad, incorrect usage: the XPath is evaluated directly on the raw html-to-xml
     output; kept here only as a counter-example -->
	<var-def name="rows">
		<xpath expression="//a">
			<html-to-xml>
				<file path="${file}" action="read"/>
			</html-to-xml>
		</xpath>
	</var-def>
    
</config>