How to strip HTML tags from a string in SQL Server?
There is a UDF that will do that described here:
User Defined Function to Strip HTML
CREATE FUNCTION [dbo].[udf_StripHTML] (@HTMLText VARCHAR(MAX))RETURNS VARCHAR(MAX) ASBEGIN DECLARE @Start INT DECLARE @End INT DECLARE @Length INT SET @Start = CHARINDEX('<',@HTMLText) SET @End = CHARINDEX('>',@HTMLText,CHARINDEX('<',@HTMLText)) SET @Length = (@End - @Start) + 1 WHILE @Start > 0 AND @End > 0 AND @Length > 0 BEGIN SET @HTMLText = STUFF(@HTMLText,@Start,@Length,'') SET @Start = CHARINDEX('<',@HTMLText) SET @End = CHARINDEX('>',@HTMLText,CHARINDEX('<',@HTMLText)) SET @Length = (@End - @Start) + 1 END RETURN LTRIM(RTRIM(@HTMLText))ENDGO
Edit: note this is for SQL Server 2005, but if you change the keyword MAX to something like 4000, it will work in SQL Server 2000 as well.
Derived from @Goner Doug answer, with a few things updated:
- using REPLACE where possible
- conversion of predefined entities like é
(I chose the ones I needed :-)
- some conversion of list tags <ul> and <li>
ALTER FUNCTION [dbo].[udf_StripHTML]--by Patrick Honorez --- www.idevlop.com--inspired by http://stackoverflow.com/questions/457701/best-way-to-strip-html-tags-from-a-string-in-sql-server/39253602#39253602(@HTMLText varchar(MAX))RETURNS varchar(MAX)ASBEGINDECLARE @Start intDECLARE @End intDECLARE @Length intset @HTMLText = replace(@htmlText, '<br>',CHAR(13) + CHAR(10))set @HTMLText = replace(@htmlText, '<br/>',CHAR(13) + CHAR(10))set @HTMLText = replace(@htmlText, '<br />',CHAR(13) + CHAR(10))set @HTMLText = replace(@htmlText, '<li>','- ')set @HTMLText = replace(@htmlText, '</li>',CHAR(13) + CHAR(10))set @HTMLText = replace(@htmlText, '’' collate Latin1_General_CS_AS, '''' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '"' collate Latin1_General_CS_AS, '"' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '&' collate Latin1_General_CS_AS, '&' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '€' collate Latin1_General_CS_AS, '€' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '<' collate Latin1_General_CS_AS, '<' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '>' collate Latin1_General_CS_AS, '>' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'œ' collate Latin1_General_CS_AS, 'oe' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, ' ' collate Latin1_General_CS_AS, ' ' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '©' collate Latin1_General_CS_AS, '©' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '«' collate Latin1_General_CS_AS, '«' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '®' collate Latin1_General_CS_AS, '®' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '±' collate Latin1_General_CS_AS, '±' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '²' collate Latin1_General_CS_AS, '²' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '³' collate Latin1_General_CS_AS, '³' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'µ' collate Latin1_General_CS_AS, 'µ' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '·' collate Latin1_General_CS_AS, '·' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'º' collate Latin1_General_CS_AS, 'º' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '»' collate Latin1_General_CS_AS, '»' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '¼' collate Latin1_General_CS_AS, '¼' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '½' collate Latin1_General_CS_AS, '½' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '¾' collate Latin1_General_CS_AS, '¾' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '&Aelig' collate Latin1_General_CS_AS, 'Æ' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'Ç' collate Latin1_General_CS_AS, 'Ç' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'È' collate Latin1_General_CS_AS, 'È' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'É' collate Latin1_General_CS_AS, 'É' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'Ê' collate Latin1_General_CS_AS, 'Ê' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'Ö' collate Latin1_General_CS_AS, 'Ö' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'à' collate Latin1_General_CS_AS, 'à' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'â' collate Latin1_General_CS_AS, 'â' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ä' collate Latin1_General_CS_AS, 'ä' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'æ' collate Latin1_General_CS_AS, 'æ' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ç' collate Latin1_General_CS_AS, 'ç' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'è' collate Latin1_General_CS_AS, 'è' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'é' collate Latin1_General_CS_AS, 'é' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ê' collate Latin1_General_CS_AS, 'ê' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ë' collate Latin1_General_CS_AS, 'ë' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'î' collate Latin1_General_CS_AS, 'î' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ô' collate Latin1_General_CS_AS, 'ô' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ö' collate Latin1_General_CS_AS, 'ö' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '÷' collate Latin1_General_CS_AS, '÷' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ø' collate Latin1_General_CS_AS, 'ø' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ù' collate Latin1_General_CS_AS, 'ù' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ú' collate Latin1_General_CS_AS, 'ú' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'û' collate Latin1_General_CS_AS, 'û' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, 'ü' collate Latin1_General_CS_AS, 'ü' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '"' collate Latin1_General_CS_AS, '"' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '&' collate Latin1_General_CS_AS, '&' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '‹' collate Latin1_General_CS_AS, '<' collate Latin1_General_CS_AS)set @HTMLText = replace(@htmlText, '›' collate Latin1_General_CS_AS, '>' collate Latin1_General_CS_AS)-- Remove anything between <STYLE> tagsSET @Start = CHARINDEX('<STYLE', @HTMLText)SET @End = CHARINDEX('</STYLE>', @HTMLText, CHARINDEX('<', @HTMLText)) + 7SET @Length = (@End - @Start) + 1WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGINSET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')SET @Start = CHARINDEX('<STYLE', @HTMLText)SET @End = CHARINDEX('</STYLE>', @HTMLText, CHARINDEX('</STYLE>', @HTMLText)) + 7SET @Length = (@End - @Start) + 1END-- Remove anything between <whatever> tagsSET @Start = CHARINDEX('<', @HTMLText)SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))SET @Length = (@End - @Start) + 1WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGINSET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')SET @Start = CHARINDEX('<', @HTMLText)SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))SET @Length = (@End - @Start) + 1ENDRETURN LTRIM(RTRIM(@HTMLText))END
If your HTML is well formed, I think this is a better solution:
create function dbo.StripHTML( @text varchar(max) ) returns varchar(max) asbegin declare @textXML xml declare @result varchar(max) set @textXML = REPLACE( @text, '&', '' ); with doc(contents) as ( select chunks.chunk.query('.') from @textXML.nodes('/') as chunks(chunk) ) select @result = contents.value('.', 'varchar(max)') from doc return @resultendgoselect dbo.StripHTML('This <i>is</i> an <b>html</b> test')