Page 1 of 1

make plain text from HTML

Posted: 2008-05-08 14:28
by mattg
Been playing with this for a while.

I have a unusual localised industry specific e-mail client, that doesn't handle HTML only e-mails.
This script modifies the HTML into plain text, and removes the HTMLBody of the e-mail.

This script works for my unusual client, but doesn't make an e-mail that displays in Outlook 2003. This suits me, but it may need to be modified slightly for others.

Modify at will, use at own risk, etc.

Code: Select all

Sub OnDeliverMessage(oMessage)
	dim counter, aMessage

	for counter = 0 to oMessage.recipients.count -1
		if oMessage.recipients(counter).address = "[b]youruser@yourdomain.com[/b]" then
			aMessage = StripHTML(oMessage.HTMLBody)
			omessage.htmlbody = ""
			oMessage.body = aMessage
			oMessage.Save
		end if
	next	'counter
	Result.Value = 0
End Sub

function StripHTML(oSource)
dim Result_Text, breaks, tab, i

	Result_text = ReplaceText(oSource," ( )+"," ")

	Result_text = Replace(Result_text,"=" & vbcrlf,"")
	Result_text = Replace(Result_text,";" & vblrcf,"")
	'add more lines as above where needed
'	Result_text = Replace(Result_text,"=" & vblrcf,"")

	' Remove the header (prepare first by clearing attributes)
	Result_text = ReplaceText(Result_text,"<( )*head([^>])*>","<head>")
	Result_text = ReplaceText(Result_text,"(<( )*(/)( )*head( )*>)","</head>")
	Result_text = ReplaceText(Result_text,"(<head>)[\s\S]*(</head>)","")

	' remove all scripts (prepare first by clearing attributes)
	Result_text = ReplaceText(Result_text,"<( )*script([^>])*?>","<script>")
	Result_text = ReplaceText(Result_text,"(<( )*(/)( )*?script()*>)","</script>")
	Result_text = ReplaceText(Result_text,"(<script>)([^(<script>\.</script>)])*?(</script>)","")
	Result_text = ReplaceText(Result_text,"(<script>)[\s\S]*?(</script>)","")
        
	' remove all styles (prepare first by clearing attributes)
	Result_text = ReplaceText(Result_text,"<( )*style([^>])*?>","<style>")
	Result_text = ReplaceText(Result_text,"(<( )*(/)( )*?style( )*>)","</style>")
	Result_text = ReplaceText(Result_text,"(<style>)[\s\S]*?(</style>)","")

	' prepare spans by first clearing attributes
	Result_text = ReplaceText(Result_text,"<( )*span([^>])*?>","<span>")
	Result_text = ReplaceText(Result_text,"(<( )*(/)( )*?span( )*>)","</span>")

	' insert tabs in spaces of <td> tags
	Result_text = ReplaceText(Result_text,"<( )*td([^>])*>","\t")

	' insert line breaks in places of <BR> and <LI> tags
	Result_text = ReplaceText(Result_text,"<( )*br( )*>","\r\r")
	Result_text = ReplaceText(Result_text,"<( )*li( )*>","\r\r")

	' insert line paragraphs (double line breaks) in place if <P>, <DIV> and <TR> tags
	Result_text = ReplaceText(Result_text,"<( )*div([^>])*>","\r\r")
	Result_text = ReplaceText(Result_text,"<( )*tr([^>])*>","\r\r")
	Result_text = ReplaceText(Result_text,"<( )*p([^>])*>","\r\r")

	' Remove remaining tags like <a>, links, images, comments etc - anything thats enclosed inside < >
	Result_text = ReplaceText(Result_text,"<[\s\S]*?>","")

        ' replace special characters:
	Result_text = ReplaceText(Result_text,"&bull;"," * ")    
	Result_text = ReplaceText(Result_text,"&lsaquo;","<")    
	Result_text = ReplaceText(Result_text,"&rsaquo;",">")    
	Result_text = ReplaceText(Result_text,"&trade;","(tm)")      
	Result_text = ReplaceText(Result_text,"&frasl;","/")
	Result_text = ReplaceText(Result_text,"<","<")
	Result_text = ReplaceText(Result_text,">",">")    
	Result_text = ReplaceText(Result_text,"&copy;","(c)")        
	Result_text = ReplaceText(Result_text,"&reg;","(r)")

	' make line breaking consistent
	Result_text = Replace(Result_text,"\n","\r")

	' Remove extra line breaks and tabs:
	' Prepare first to remove any whitespaces inbetween
	' the escaped characters and remove redundant tabs inbetween linebreaks
	Result_text = ReplaceText(Result_text,"(\r)( )+(\r)","\r\r")
	Result_text = ReplaceText(Result_text,"(\t)( )+(\t)","\t\t")
	Result_text = ReplaceText(Result_text,"(\t)( )+(\r)","\t\r")
	Result_text = ReplaceText(Result_text,"(\r)( )+(\t)","\r\t")

	' Remove redundant tabs
	Result_text = ReplaceText(Result_text,"(\r)(\t)+(\r)","\r\r")

	' Remove multible tabs followind a linebreak with just one tab
	Result_text = ReplaceText(Result_text,"(\r)(\t)+","\r\t")

	' Initial replacement target string for linebreaks
	breaks = "\r\r"

	' Initial replacement target string for tabs
	tabs = "\t\t\t"

	for i=0 to 5 
		Result_text = Replace(Result_text, breaks, vbcrlf)
		Result_text = Replace(Result_text, tabs, "")
		Result_text = Replace(Result_text, vbcrlf & vbcrlf & vbcrlf, vbcrlf & vbcrlf)
	next 'i

	Result_text = Replace(Result_text, "&nbsp", "")

        ' Thats it.
      	StripHTML = Result_Text
End Function


Function ReplaceText(str1, patrn, replStr)
	Dim regEx
	Set regEx = New RegExp
	with regEx
		.Pattern = patrn
		.IgnoreCase = True
		.Global = True
	end with
	ReplaceText = regEx.Replace(str1, replStr)
End Function

Re: make plain text from HTML

Posted: 2012-07-27 02:48
by Bill48105
Hey Matt!
Old thread but wanted to say THANKS for this. Saved me having to write it myself. :D Few minor tweaks for my needs & had it stripping HTML for UPS email alerts sent to my cell text messages in just a couple minutes.

One thing I needed was to set the HTML to the plain version instead of empty like:

Code: Select all

oMessage.htmlbody = aMessage
oMessage.body = aMessage
Seems hmail doesn't remove the html MIME part & some email clients show blank email if set to "". Of course the choice would depend on email client.
Bill

Re: make plain text from HTML

Posted: 2012-07-27 04:08
by mattg
no worries mate

Re: make plain text from HTML

Posted: 2012-07-27 06:53
by dzekas
how about replacing all tags that need convertion to line feed first and then removing all tags.

list of special characters is a bit longer.

Re: make plain text from HTML

Posted: 2012-07-27 19:20
by dzekas
Bill48105 wrote:I wasn't attempting to critique or improve his script
Maybe there is some language barrier. It looks like simple suggestions are interpreted as critique here. :)

Re: make plain text from HTML

Posted: 2012-07-28 09:05
by mattg
It's 'all good' Dzekas.

You input is always valuable. You have a far higher technical skill than I. I always learn from what you say.

Re: make plain text from HTML

Posted: 2016-10-19 00:01
by atahir
Hello,

Sorry to necro post.

Is there a way to modify the above so that the lines are maintained.

Currently the script will strip everything out and put everything in to one line, can it be made to keep the text in multiple lines?

Thanks in advance for any help

Re: make plain text from HTML

Posted: 2016-10-19 16:03
by mattg
sure, just remove the regex lines that replace line endings with an empty string

Re: make plain text from HTML

Posted: 2016-10-19 22:50
by atahir
Thanks for the reply Matt, much appreciated.

Re: make plain text from HTML

Posted: 2016-10-20 03:59
by atahir
Not having much luck your suggestion, pretty sure I'm doing it wrong.

Don't suppose you're able to provide an example of the modification you mention?

Thanks

Re: make plain text from HTML

Posted: 2016-10-20 05:23
by mattg

Code: Select all

Sub OnDeliverMessage(oMessage)
   dim counter, aMessage

   for counter = 0 to oMessage.recipients.count -1
      if oMessage.recipients(counter).address = "[b]youruser@yourdomain.com[/b]" then
         aMessage = StripHTML(oMessage.HTMLBody)
         omessage.htmlbody = ""
         oMessage.body = aMessage
         oMessage.Save
      end if
   next   'counter
   Result.Value = 0
End Sub

function StripHTML(oSource)
dim Result_Text, breaks, tab, i

   Result_text = ReplaceText(oSource," ( )+"," ")

   Result_text = Replace(Result_text,"=" & vbcrlf,"")
   Result_text = Replace(Result_text,";" & vblrcf,"")
   'add more lines as above where needed
'   Result_text = Replace(Result_text,"=" & vblrcf,"")

   ' Remove the header (prepare first by clearing attributes)
   Result_text = ReplaceText(Result_text,"<( )*head([^>])*>","<head>")
   Result_text = ReplaceText(Result_text,"(<( )*(/)( )*head( )*>)","</head>")
   Result_text = ReplaceText(Result_text,"(<head>)[\s\S]*(</head>)","")

   ' remove all scripts (prepare first by clearing attributes)
   Result_text = ReplaceText(Result_text,"<( )*script([^>])*?>","<script>")
   Result_text = ReplaceText(Result_text,"(<( )*(/)( )*?script()*>)","</script>")
   Result_text = ReplaceText(Result_text,"(<script>)([^(<script>\.</script>)])*?(</script>)","")
   Result_text = ReplaceText(Result_text,"(<script>)[\s\S]*?(</script>)","")
       
   ' remove all styles (prepare first by clearing attributes)
   Result_text = ReplaceText(Result_text,"<( )*style([^>])*?>","<style>")
   Result_text = ReplaceText(Result_text,"(<( )*(/)( )*?style( )*>)","</style>")
   Result_text = ReplaceText(Result_text,"(<style>)[\s\S]*?(</style>)","")

   ' prepare spans by first clearing attributes
   Result_text = ReplaceText(Result_text,"<( )*span([^>])*?>","<span>")
   Result_text = ReplaceText(Result_text,"(<( )*(/)( )*?span( )*>)","</span>")

   ' insert tabs in spaces of <td> tags
   Result_text = ReplaceText(Result_text,"<( )*td([^>])*>","\t")

   ' insert line breaks in places of <BR> and <LI> tags
   Result_text = ReplaceText(Result_text,"<( )*br( )*>","\r\r")
   Result_text = ReplaceText(Result_text,"<( )*li( )*>","\r\r")

   ' insert line paragraphs (double line breaks) in place if <P>, <DIV> and <TR> tags
   Result_text = ReplaceText(Result_text,"<( )*div([^>])*>","\r\r")
   Result_text = ReplaceText(Result_text,"<( )*tr([^>])*>","\r\r")
   Result_text = ReplaceText(Result_text,"<( )*p([^>])*>","\r\r")

   ' Remove remaining tags like <a>, links, images, comments etc - anything thats enclosed inside < >
   Result_text = ReplaceText(Result_text,"<[\s\S]*?>","")

        ' replace special characters:
   Result_text = ReplaceText(Result_text,"&bull;"," * ")   
   Result_text = ReplaceText(Result_text,"&lsaquo;","<")   
   Result_text = ReplaceText(Result_text,"&rsaquo;",">")   
   Result_text = ReplaceText(Result_text,"&trade;","(tm)")     
   Result_text = ReplaceText(Result_text,"&frasl;","/")
   Result_text = ReplaceText(Result_text,"<","<")
   Result_text = ReplaceText(Result_text,">",">")   
   Result_text = ReplaceText(Result_text,"&copy;","(c)")       
   Result_text = ReplaceText(Result_text,"&reg;","(r)")

   ' make line breaking consistent
   Result_text = Replace(Result_text,"\n","\r")

   ' Remove extra line breaks and tabs:
   ' Prepare first to remove any whitespaces inbetween
   ' the escaped characters and remove redundant tabs inbetween linebreaks
   Result_text = ReplaceText(Result_text,"(\r)( )+(\r)","\r\r")
   Result_text = ReplaceText(Result_text,"(\t)( )+(\t)","\t\t")
   Result_text = ReplaceText(Result_text,"(\t)( )+(\r)","\t\r")
   Result_text = ReplaceText(Result_text,"(\r)( )+(\t)","\r\t")

   ' Remove redundant tabs
   Result_text = ReplaceText(Result_text,"(\r)(\t)+(\r)","\r\r")

   ' Remove multible tabs followind a linebreak with just one tab
  ' Result_text = ReplaceText(Result_text,"(\r)(\t)+","\r\t")                                     Probably just commenting out this line will do what you need.
  

   ' Initial replacement target string for linebreaks
   breaks = "\r\r"

   ' Initial replacement target string for tabs
   tabs = "\t\t\t"

   for i=0 to 5
      Result_text = Replace(Result_text, breaks, vbcrlf)
      Result_text = Replace(Result_text, tabs, "")
      Result_text = Replace(Result_text, vbcrlf & vbcrlf & vbcrlf, vbcrlf & vbcrlf)
   next 'i

   Result_text = Replace(Result_text, "&nbsp", "")

        ' Thats it.
         StripHTML = Result_Text
End Function


Function ReplaceText(str1, patrn, replStr)
   Dim regEx
   Set regEx = New RegExp
   with regEx
      .Pattern = patrn
      .IgnoreCase = True
      .Global = True
   end with
   ReplaceText = regEx.Replace(str1, replStr)
End Function
All of the VBCRLFs should be swapping HTML new line characters for VB new line commands. there should be heaps of new lines created
Just commenting out that one line I detail should add more line endings.

If not please post an example of the HTML you'd like fixed...

Re: make plain text from HTML

Posted: 2016-10-21 04:59
by atahir
Hi Matt,

Still having an issue, here is some further info:

Original Email sent as:

Code: Select all

 
<html dir=3D"ltr"><head><title></title></head><body>From: XX To: =
00000000000 </body></html><br /><br /><span style=3D>------------=
-----------------------------<br />Service Record #146974<br />Su=
mmary: Alert: Agent Status on User-PC is Failed -- <br />Company:=
 Test Company<br />Contact: Test User<br />Phone:   000000000=
00<br /></span>
Renders as:

From: XX
To: 61000000000

-----------------------------------------
Service Record #144177
Summary: Alert: Agent Status on User-PC is Normal --
Company: Test Company
Contact: Test User
Phone: 61000000000

After Script is applied:

Code: Select all



This is a multi-part message.

--__=_Part_Boundary_001_012927.014982
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: quoted-printable

From: XXTo: 61000000000-----------------------------------------Service Record #146962Summary: Alert: Agent Status on User-PC is Normal -- Company: Test CompanyContact: Test ContactPhone: 61000000000


--__=_Part_Boundary_001_012927.014982
Content-Type: text/html; charset="us-ascii"
Content-Transfer-Encoding: quoted-printable

From: XXTo: 61000000000-----------------------------------------Service Record #146962Summary: Alert: Agent Status on User-PC is Normal -- Company: Test CompanyContact: Test ContactPhone: 61000000000


--__=_Part_Boundary_001_012927.014982--

Appears as:

From: XXTo: 61000000000-----------------------------------------Service Record #146962Summary: Alert: Agent Status on User-PC is Normal -- Company: Test CompanyContact: Test ContactPhone: 61000000000


Ideally just trying to get the Text and different lines happening so that it looks like:

From: XX
To: 61000000000

-----------------------------------------
Service Record #144177
Summary: Alert: Agent Status on User-PC is Normal --
Company: Test Company
Contact: Test User
Phone: 61000000000



--
Thanks a bunch

Re: make plain text from HTML

Posted: 2016-10-24 00:52
by mattg
Just as a matter of interest, do you have SMTP >> RFC Compliance >> 'Allow incorrectly formatted line endings' selected??

What you want changed is not in the HTML text, but in the Message source.
Looking at some messages on my system, The longest line is typically about 80 characters..

I'm wondering if you allow incorrectly formatted line endings, if hMailsevrer will just sort this out for you automatically