XML transformation and XPath search for Selenium-scraped pages/sources


#1

The sample below demonstrates an approach to parsing and scraping an HTML page obtained from the Selenium page source.

<?xml version="1.0" encoding="UTF-8"?>
<config>

	<!-- Shared include; presumably supplies the SeleniumFunctions helpers called by the browser scripts (confirm). -->
	<include-config code="Selenium_functions"/>

	<!-- Script imports plus the two raw page-source holders that later scripts fill in. -->
	<script><![CDATA[
		import com.thoughtworks.selenium.*;
		import org.openqa.selenium.*;
		import java.util.concurrent.TimeUnit;

		// HTML of the search-results page.
		profilePage = "";
		// HTML of the case-detail page.
		profilePageCase = "";
	]]></script>

	<!-- Search inputs: each variable is re-declared from the same-named value already in scope. -->
	<var-def name="search_case_number">
		<template>${search_case_number}</template>
	</var-def>

	<var-def name="first_name">
		<template>${first_name}</template>
	</var-def>

	<var-def name="last_name">
		<template>${last_name}</template>
	</var-def>

	<var-def name="dob">
		<template>${dob}</template>
	</var-def>

	<!-- XML versions of the two scraped pages; declared empty, assigned further down. -->
	<var-def name="xmlValue"/>
	<var-def name="xmlValueProfile"/>

	<!-- Output fields; declared empty up front so the export step can always reference them. -->
	<var-def name="found_information"/>
	<var-def name="linked_attorney"/>
	<var-def name="found_dob"/>
	<var-def name="case_id"/>
	<var-def name="linked_case_id"/>
	<var-def name="department"/>
	<var-def name="offence_date"/>
	<var-def name="officer_badge"/>

	<!-- Failure flag, declared once with its default. It is read by the <if> condition inside the
	     try body, set to "true" by the <catch> handler, and written to the export. -->
	<var-def name="site_chaged">
		<template>false</template>
	</var-def>

	<!-- Fixed description of the scraped source; both values are written to the export. -->
	<var-def name="source_name">
		<template>Duval County Clerk of Courts</template>
	</var-def>
	<var-def name="source_search_url">
		<template>https://core.duvalclerk.com/CoreCms.aspx?mode=PublicAccess</template>
	</var-def>

	<!-- Match flag; starts as "false" and is set to "true" by the result-row script. -->
	<var-def name="matched">
		<template>false</template>
	</var-def>
	
	<try>
		<body>

       <!-- Drives Firefox through the name search form and stores the resulting page HTML in profilePage. -->
       <selenium-flow>
				<selenium name="seleniumDriver" browser="firefox" close-on-completion="false">
					<script><![CDATA[
             	
                // Keep the unwrapped driver in a script variable; later scripts in this config use `driver` too.
                driver = seleniumDriver.getWrappedObject();
                // 30 s implicit element wait, 1 min page-load timeout.
                driver.manage().timeouts().implicitlyWait(30, TimeUnit.SECONDS).pageLoadTimeout(1, TimeUnit.MINUTES);
                // Open the public search page.
                driver.get(source_search_url.toString());
                // Wait for the page form and click it (the 30 is presumably a timeout in seconds; confirm in SeleniumFunctions).
	            SeleniumFunctions.waitUntilVisible(By.id("form1"),30,driver).click();
                // Fixed 2 s pause between UI steps.
				Thread.sleep(2000);
                // Type the trimmed last name into the LastName input of the search form.
	            SeleniumFunctions.waitUntilClickable(By.xpath("//div[@id='ContentPlaceHolder1_c_TabContainerDiv']//tr[5]/td[@class='caseSearchFieldInput']/input[contains(@name,'LastName')]"),driver).sendKeys(new String[]{last_name.toString().trim()});
                Thread.sleep(2000);
                // Type the trimmed first name into the FirstName input of the same row.
	            SeleniumFunctions.waitUntilClickable(By.xpath("//div[@id='ContentPlaceHolder1_c_TabContainerDiv']//tr[5]/td[@class='caseSearchFieldInput']/input[contains(@name,'FirstName')]"),driver).sendKeys(new String[]{first_name.toString().trim()});
				Thread.sleep(2000);
                // Submit the search.
	            SeleniumFunctions.waitUntilClickable(By.xpath("//div[@id='ContentPlaceHolder1_c_TabContainerDiv']//td[@class='caseSearchFooter']/input[@value='Begin Search']"),driver).click();
				Thread.sleep(2000);
                // Wait until the result grid is shown before reading the page.
	            SeleniumFunctions.waitUntilVisible(By.xpath("//table[@class='resultGrid']"),30,driver);
               
                // Capture the results page HTML for the html-to-xml step that follows.
                profilePage =  driver.getPageSource();
               
            ]]></script>
				</selenium>
			</selenium-flow>


			<!-- Turn the captured results HTML into XML (head and meta removed) for the XPath steps. -->
			<var-def name="xmlValue">
				<html-to-xml omitdoctype="true" prunetags="head,meta">
					<template>${profilePage}</template>
				</html-to-xml>
			</var-def>

			<case>
				<if condition='${site_chaged.toString().equals("false") &amp;&amp; !xmlValue.toString().contains("No results found for specified search criteria")}'>

					<!-- All table rows of the result grid; the loop below visits them one by one. -->
					<var-def name="results">
						<xpath expression="//div[@class='displayOnly']//table[@class='resultGrid']//tr">
							<template>${xmlValue}</template>
						</xpath>
					</var-def>

					<!-- Declared empty here; reassigned for every row inside the loop. -->
					<var-def name="case_number"/>

					<!-- Visit every result row, pull its case number, DOB, info text and span id,
					     and record the row whose DOB equals the requested one. -->
					<loop item="item">
						<list>
							<var name="results" />
						</list>
						<body>
							<empty>
								<!-- Text of the div elements in the row (used as the linked case id). -->
								<var-def name="case_number">
									<xpath expression="//tr//div/text()">
										<var name="item" />
									</xpath>
								</var-def>

								<!-- Second span of the second cell; the script strips a "DOB:" label from it. -->
								<var-def name="dob_found">
									<xpath expression="//tr/td[2]/div/span[2]/text()">
										<var name="item" />
									</xpath>
								</var-def>

								<!-- All span texts of the second cell. -->
								<var-def name="information">
									<xpath expression="//tr/td[2]//span/text()">
										<var name="item" />
									</xpath>
								</var-def>

								<!-- id of the first span in the third cell; used to build the click XPath. -->
								<var-def name="caseid">
									<xpath expression="//tr//td[3]/span[1]/@id">
										<var name="item" />
									</xpath>
								</var-def>

								<script><![CDATA[
									String search = dob.toString().trim();
									// Remove all whitespace and the "DOB:" label so only the date text is compared.
									String foundDob = dob_found.toString().trim().replace("\n","").replaceAll("\\s+","").replace("DOB:","");
									String foundInfo = information.toString().trim().replace("\n"," ").replaceAll("\\s+"," ");
									String caseId = caseid.toString().trim();

									// An empty requested DOB must not match: rows without a DOB value
									// (for example a header row) also produce an empty foundDob.
									if (!search.isEmpty() && search.equals(foundDob)) {
										sys.defineVariable("matched","true",true);
										// Store the matched DOB so the exported found_dob column is populated.
										sys.defineVariable("found_dob",foundDob,true);
										sys.defineVariable("linked_case_id",case_number.toString().trim(),true);
										sys.defineVariable("found_information",foundInfo,true);
										// XPath of the element three levels above the row's span; clicked later to open the case.
										String val = "//div[@class='igtab_THContent']//span[@id='" + caseId +"']/parent::*/parent::*/parent::*";
										sys.defineVariable("case_id",val,true);
									}
								]]></script>
							</empty>
						</body>
					</loop>

					<!-- Opens the matched case (if any) in the browser and captures the detail page HTML. -->
					<script><![CDATA[
				try {
                  // case_id holds the click XPath built by the row loop; it is empty when no row matched.
                  if (!case_id.toString().isEmpty()) {
                      SeleniumFunctions.waitUntilClickable(By.xpath(case_id.toString()),driver).click();
                      Thread.sleep(2000);
                      // Wait for the case summary grid before reading the page.
                      SeleniumFunctions.waitUntilVisible(By.xpath("//table[@id='c_CaseSummaryClerkGridView']"),30,driver);
                      Thread.sleep(2000);
                      profilePageCase =  driver.getPageSource();
                  }
				}
				catch (Exception ex) {

					 // Best effort: on any failure above, keep whatever page the browser currently shows.
					 profilePageCase =  driver.getPageSource();
				}
            
            ]]></script>

					<!-- XML form of the case-detail page; profilePageCase is still empty when no row was opened. -->
					<var-def name="xmlValueProfile">
						<html-to-xml omitdoctype="true" prunetags="head,meta">
							<template>${profilePageCase}</template>
						</html-to-xml>
					</var-def>

					<!-- Span text in the second row of the attorney grid. -->
					<var-def name="linked_attorney">
						<xpath expression="//div[@class='igtab_THContent']//table[@id='c_AttorneyBlock_c_AttorneyClerkGridView']//tr[2]/td/span/text()">
							<var name="xmlValueProfile"/>
						</xpath>
					</var-def>

					<!-- The next three fields read the fourth cell of rows 1, 2 and 3 of the case summary grid. -->
					<var-def name="department">
						<xpath expression="//div[@class='igtab_THContent']//table[@id='c_CaseSummaryClerkGridView']//tr[1]/td[4]/text()">
							<var name="xmlValueProfile"/>
						</xpath>
					</var-def>

					<var-def name="offence_date">
						<xpath expression="//div[@class='igtab_THContent']//table[@id='c_CaseSummaryClerkGridView']//tr[2]/td[4]/text()">
							<var name="xmlValueProfile"/>
						</xpath>
					</var-def>

					<var-def name="officer_badge">
						<xpath expression="//div[@class='igtab_THContent']//table[@id='c_CaseSummaryClerkGridView']//tr[3]/td[4]/text()">
							<var name="xmlValueProfile"/>
						</xpath>
					</var-def>
				</if>
			</case>
			<!-- Shut the browser down once scraping has finished. -->
			<script><![CDATA[
                // quit() closes every window and ends the session on its own. A preceding close()
                // on the last window already ends the browser, so the following quit() could throw
                // and be reported by the surrounding <catch> as a site change.
                driver.quit();
            ]]></script>


		</body>
		<catch>
			<!-- Any error raised inside the try body ends up here and is reported as a site change. -->
			<!-- NOTE(review): the browser is only shut down at the end of the try body, so it appears
			     to stay open when this handler runs; confirm and add cleanup if needed. -->
			<var-def name="site_chaged">
				<template>true</template>
			</var-def>
		</catch>
	</try>
	<!-- Run date formatted as yyyyMMdd; used in the S3 folder names below. -->
	<var-def name="currentDate">
		<script return="result"><![CDATA[
			result = new java.text.SimpleDateFormat("yyyyMMdd").format(new Date());
		]]></script>
	</var-def>

	<!-- Timestamp of this run (yyyy-MM-dd'T'HH-mm-ss.SSS); used in the S3 object names below. -->
	<var-def name="last_checked">
		<script return="result">
      <![CDATA[
          Date date = new Date();
          // Assign `result` without a type, as the currentDate script does, so the value
          // picked up through return="result" does not depend on how typed locals are scoped.
          result = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss.SSS").format(date);
      ]]>
		</script>
	</var-def>

	<!-- Upload the converted search-results page; the resulting value is exported as search_s3_link_2. -->
	<var-def name="searchResultsLink">
		<s3 access-key="" secret-key="" bucket="rpa-tmp">
			<s3-put-public
				path="gis/search_duval_${currentDate}/search_${last_checked}.html"
				content="${xmlValue}"
				content-type="text/html"
				content-disposition="inline"/>
		</s3>
	</var-def>

	<!-- Upload the converted case-detail page; the resulting value is exported as profile_s3_link_2. -->
	<var-def name="profilePageLink">
		<s3 access-key="" secret-key="" bucket="rpa-tmp">
			<s3-put-public
				path="gis/search_duval_profile${currentDate}/search_${last_checked}.html"
				content="${xmlValueProfile}"
				content-type="text/html"
				content-disposition="inline"/>
		</s3>
	</var-def>

	<!-- Write the collected values as extra columns next to the original input data. -->
	<export include-original-data="true">
		<!-- Match outcome -->
		<single-column name="matched" value="${matched}"/>
		<single-column name="found_dob" value="${found_dob}"/>
		<single-column name="linked_case_id" value="${linked_case_id}"/>
		<single-column name="linked_attorney" value="${linked_attorney}"/>
		<single-column name="found_information" value="${found_information}"/>
		<single-column name="site_chaged" value="${site_chaged}"/>
		<!-- Stored page copies and source description -->
		<single-column name="search_s3_link_2" value="${searchResultsLink}"/>
		<single-column name="profile_s3_link_2" value="${profilePageLink}"/>
		<single-column name="source_search_url" value="${source_search_url}"/>
		<single-column name="source_name" value="${source_name}"/>
		<!-- Case-detail fields -->
		<single-column name="department" value="${department}"/>
		<single-column name="offence_date" value="${offence_date}"/>
		<single-column name="officer_badge" value="${officer_badge}"/>
	</export>

</config>