Fetch an image from a mail body and use OCR to extract all the details from the image

email
hyperlink

#1

I am trying to fetch image links from a mail body and perform OCR on the images. Below is a sample mail body that contains an image:

I have used the org.apache.poi.hsmf.MAPIMessage API to get the mail body as text, e.g.:

image

Can you please suggest how to extract the image links and perform OCR on the images?


#2

Hi @sd00465077 do you use drag-and-drop Recorder or do you code in WorkFusion Studio?


#3

I am using code for this. Please find my code snippet below:

<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://web-harvest.sourceforge.net/schema/1.0/config"
	scriptlang="groovy">


	<script><![CDATA[
		import org.apache.commons.vfs2.FileType;
		import java.io.File;
		import java.util.ArrayList;
		import org.apache.poi.hsmf.MAPIMessage;
		import org.apache.poi.hsmf.datatypes.AttachmentChunks;
		import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
		import java.io.FilenameFilter;
		import java.nio.file.Files;
		import org.apache.poi.ss.usermodel.Cell;
		import org.apache.poi.ss.usermodel.Row;
		import org.apache.poi.ss.usermodel.Sheet;
		import org.apache.poi.ss.usermodel.Workbook;
		import org.apache.poi.ss.util.CellReference;
		import org.apache.poi.xssf.streaming.SXSSFWorkbook;
		import com.google.gson.Gson;




		// Folder holding the exported Outlook .msg files. Attachments are saved
		// under its "Temp" sub-folder, one directory per message file.
		def filepath= "C:\\Users\\sd00465077\\Desktop\\Werner_emails\\";

		// Reads one optional message field as text. POI signals an absent field
		// (for example a mail without CC) with ChunkNotFoundException; record an
		// empty value instead of failing the whole message.
		def readField = { Closure getter ->
			try {
				def value = getter.call();
				return value == null ? "" : value.toString();
			} catch (ChunkNotFoundException ignored) {
				return "";
			}
		}

		// Created before any I/O so the JSON export at the end always has a list
		// to serialize, even when the folder cannot be read.
		data = new ArrayList()
		sys.defineVariable("data", data);

		try{
			File[] files = new File(filepath).listFiles(new FilenameFilter() {
						public boolean accept(File dir, String name) {
							return name.toLowerCase(Locale.ROOT).endsWith(".msg");
						}
					});

			// listFiles() returns null when the path is missing or unreadable.
			if(files == null) {
				System.err.println("Can't list directory " + filepath);
				files = new File[0];
			}

			for(File f:files){
				println(f.getName());
			}

			for(File f:files) {
				def filename = f.getName();
				MAPIMessage msg = null;

				// Each message is handled on its own so one malformed mail does
				// not stop the remaining files from being processed.
				try {
					Map rec = new HashMap();
					msg = new MAPIMessage(filepath + filename);

					String body = readField { msg.getTextBody() };

					rec.put("From", readField { msg.getDisplayFrom() });
					rec.put("To", readField { msg.getDisplayTo() });
					rec.put("CC", readField { msg.getDisplayCC() });
					rec.put("Subject", readField { msg.getSubject() });
					rec.put("Email body", body);

					// Heuristic only: the text body mentions an image file extension.
					def Inline_attachment = body.contains(".png") || body.contains(".jpeg") || body.contains(".jpg");
					rec.put("Email inline attachments", Inline_attachment);

					AttachmentChunks[] attachments = msg.getAttachmentFiles();
					if(attachments.length == 0){
						rec.put("attachments", "no attachments");
					} else {
						def temp_dir = filepath + "Temp\\" + filename
						sys.defineVariable("temp_dir", temp_dir);

						File d = new File(temp_dir);
						// mkdir() alone reports failure when the directory is left over
						// from a previous run or when the "Temp" parent does not exist.
						boolean dirReady = d.isDirectory() || d.mkdirs();
						if(!dirReady) {
							System.err.println("Can't create directory "+ temp_dir);
						}

						def names = [];
						int index = 0;
						for(AttachmentChunks attachment : attachments) {
							index++;

							// Prefer the long file name; either name chunk may be absent.
							def longName = attachment.getAttachLongFileName();
							def shortName = attachment.getAttachFileName();
							String fileName1 = "attachment-" + index;
							if(longName != null) {
								fileName1 = longName.toString();
							} else if(shortName != null) {
								fileName1 = shortName.toString();
							}
							names.add(fileName1);

							if(!dirReady) {
								continue;
							}
							if(attachment.getAttachData() == null) {
								// No raw byte chunk on this attachment, so nothing to write.
								System.err.println("No data for attachment " + fileName1 + " in " + filename);
								continue;
							}

							// The name comes from the mail itself: keep only its last path
							// segment so the file cannot be written outside temp_dir.
							String safeName = new File(fileName1.replace('\\', '/')).getName();
							if(safeName.isEmpty() || safeName == "." || safeName == "..") {
								safeName = "attachment-" + index;
							}

							try {
								byte[] bytes = attachment.getAttachData().getValue();
								new File(d, safeName).withOutputStream { out -> out.write(bytes) };
							} catch (IOException e) {
								System.err.println("Can't save attachment " + fileName1 + " of " + filename + ": " + e);
							}
						}

						// Every attachment name is reported, not just the last one.
						rec.put("attachments", names.join(", "));
					}
					data.add(rec);
				} catch (Exception e) {
					System.out.println("Exception raised " + filename + ": " + e);
				} finally {
					// Release the underlying file when this POI version offers close().
					if(msg != null && msg.respondsTo("close")) {
						try {
							msg.close();
						} catch (IOException ignored) {
							// Nothing useful can be done if closing fails.
						}
					}
				}
			}
		}
		catch (Exception e){
			System.out.println("Exception raised " + e);
		}

		// Serialized once, whether or not an error occurred above.
		mail_extract = new groovy.json.JsonBuilder(data)
	]]></script>



	<!--
		Defines the variable CSV_report from a <file> write to "CSV_report.csv"
		whose content is the <list-to-csv> rendering of ${mail_extract}.
		NOTE(review): mail_extract is assigned a groovy.json.JsonBuilder by the
		script; confirm that list-to-csv accepts that form as its input.
		NOTE(review): the path is relative, so the output location presumably
		depends on the working directory of the run; verify.
	-->
	<var-def name="CSV_report">
		<file path="CSV_report.csv" action="write" type="binary">
			<list-to-csv>
				<template>${mail_extract}</template>
			</list-to-csv>
		</file>
	</var-def>


	<!--
		Export section. No columns are declared at the moment: the only
		<single-column> entry is commented out.
		NOTE(review): include-original-data="false" presumably keeps the input
		record's columns out of the exported result; confirm against the
		platform documentation.
	-->
	<export include-original-data="false">
		<!-- <single-column name="Attachment_links" value="${fileOut}" /> -->
	</export>
</config>

closed #4