As a compliment to my MaxMind GeoIP Locator scripts and the parse top-level domain from URL function, I've created another function to parse the search term from an URL.
I've heard that T-SQL is a bit slow with parsing strings and wanted to test this myself so I created this in both C# and T-SQL. The difference is very noticeable, indeed: while the original, unoptimized function I wrote in T-SQL took 11 seconds to parse 10,000 referral URLs, the C# version took 4 seconds. The updated, more efficient function takes 6 seconds to run in T-SQL and 3 seconds in C#. Below are two truncated versions of the functions I created. I slimmed them down to make them easier to read on the blog but you can download the full versions here: qscleaner.cs and qscleaner.sql. If you do not have Visual Studio.net, scroll to the bottom of the post -- there I link the qscleaner.dll and give sample CREATE ASSEMBLY syntax.
QSCleaner.cs
/*
Created by Chrissy LeMaire (clemaire@gmail.com)
Website: http://netnerds.net/
! - THIS IS THE TRUNCATED VERSION created to keep the blog posting size down. It works but not as
!- precisely. Please download the full version @ http://blog.netnerds.net/code/qscleaner.cs
This C# UDF does the following
1. Guesses the search engine's search keyword delimiter
2. Finds the term and extracts it
3. Replaces querystring percentage encoding with its human readable equiv.
NO WARRANTIES, USE THIS AT YOUR OWN RISK, etc.
*/
using System;
using System.Data;
using System.Data.SqlClient;
using System.Data.SqlTypes;
using Microsoft.SqlServer.Server;
public partial class UserDefinedFunctions
{
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlString QSCleaner(string strURL)
{
int ampStart, slashStart, delimiterStart;
string strQueryString = "";
string strDelimiter = "";
strURL = strURL.Replace("aq=", "aa=");
strURL = strURL.Replace("as_q=", "aa=");
strURL = strURL.Replace("as_oq=", "aa=");
strURL = strURL.Replace("as_eq=", "aa=");
while (strDelimiter.Length == 0)
{
// most used search delimiters
if (strURL.IndexOf("q=") > 0)
{ strDelimiter = "q="; break; }
if (strURL.IndexOf("p=") > 0)
{ strDelimiter = "p="; break; }
if (strURL.IndexOf("query=") > 0)
{ strDelimiter = "="; break; }
if (strURL.IndexOf("search=") > 0)
{ strDelimiter = "search="; break; }
if (strURL.IndexOf("qry=") > 0)
{ strDelimiter = "qry="; break; }
if (strURL.IndexOf("p=") > 0)
{ strDelimiter = "p="; break; }
if (strURL.IndexOf("words=") > 0)
{ strDelimiter = "words="; break; }
if (strURL.IndexOf("word=") > 0)
{ strDelimiter = "word="; break; }
if (strURL.IndexOf("MT=") > 0)
{ strDelimiter = "MT="; break; }
if (strURL.IndexOf("s=") > 0)
{ strDelimiter = "s="; break; }
if (strURL.IndexOf("key=") > 0)
{ strDelimiter = "key="; break; }
// end most used
//if by this time it isnt set.. it's not likely a search string
if (strDelimiter.Length == 0) strDelimiter = "unknown";
}
if (strURL.IndexOf(strDelimiter) > 0)
{
try
{
delimiterStart = strURL.IndexOf(strDelimiter) + strDelimiter.Length;
strQueryString = strURL.Substring(delimiterStart, (strURL.Length - delimiterStart));
ampStart = strQueryString.IndexOf("&");
if (ampStart > 0) strQueryString = strQueryString.Substring(0, ampStart);
slashStart = strQueryString.IndexOf("/");
if (slashStart > 0) strQueryString = strQueryString.Substring(0, slashStart - 1);
// because I can't get System.Web.HttpUtility.UrlDecode to work
strQueryString = strQueryString.Replace("%20", " ");
strQueryString = strQueryString.Replace("%21", "!");
strQueryString = strQueryString.Replace("%22", """);
strQueryString = strQueryString.Replace("%23", "#");
strQueryString = strQueryString.Replace("%24", "$");
strQueryString = strQueryString.Replace("%25", "%");
strQueryString = strQueryString.Replace("%26", "&");
strQueryString = strQueryString.Replace("%27", "''");
strQueryString = strQueryString.Replace("%28", "(");
strQueryString = strQueryString.Replace("%29", ")");
strQueryString = strQueryString.Replace("%2A", "*");
strQueryString = strQueryString.Replace("%2B", "+");
strQueryString = strQueryString.Replace("%2C", ",");
strQueryString = strQueryString.Replace("%2D", "-");
strQueryString = strQueryString.Replace("%2E", ".");
strQueryString = strQueryString.Replace("%2F", "/");
strQueryString = strQueryString.Replace("%3A", ":");
strQueryString = strQueryString.Replace("%3B", ";");
strQueryString = strQueryString.Replace("%3C", "<");
strQueryString = strQueryString.Replace("%3D", "=");
strQueryString = strQueryString.Replace("%3E", ">");
strQueryString = strQueryString.Replace("%3F", "?");
strQueryString = strQueryString.Replace("%40", "@");
strQueryString = strQueryString.Replace("%5B", "[");
strQueryString = strQueryString.Replace("%5C", "");
strQueryString = strQueryString.Replace("%5D", "]");
strQueryString = strQueryString.Replace("%5E", "^");
strQueryString = strQueryString.Replace("%5F", "_");
strQueryString = strQueryString.Replace("%60", "`");
strQueryString = strQueryString.Replace("%7B", "{");
strQueryString = strQueryString.Replace("%7C", "|");
strQueryString = strQueryString.Replace("%7D", "}");
strQueryString = strQueryString.Replace("+", " ");
}
catch
{
strQueryString = strURL;
}
}
else
strQueryString = strURL;
return new SqlString(strQueryString);
}
};
If you don't have access to SQL Server 2005 or SQL 2k5 CLR, you can still use a T-SQL function.
QSCleaner.sql
/*
Created by Chrissy LeMaire (clemaire@gmail.com)
Website: http://netnerds.net/
! - THIS IS THE TRUNCATED VERSION created to keep the blog posting size down. It works but not as
!- precisely. Please download the full version @ http://blog.netnerds.net/code/qscleaner.sql
This T-SQL Function does the following
1. Guesses the search engine's search keyword delimiter
2. Finds the term and extracts it
3. Replaces querystring percentage encoding with its human readable equiv.
NO WARRANTIES, USE THIS AT YOUR OWN RISK, etc.
*/
CREATE FUNCTION [dbo].[QSCleaner] (@strURL varchar(512))
RETURNS varchar(255)
AS
BEGIN
DECLARE @ampStart int
DECLARE @queryStr varchar(512)
DECLARE @slashStart int
DECLARE @strDelimiter varchar(64)
--Clean up some stuff that gets in the way of q=
SET @strURL = REPLACE(@strURL, 'aq=', 'aa=')
SET @strURL = REPLACE(@strURL, 'as_q=', 'aa=')
SET @strURL = REPLACE(@strURL, 'as_oq=', 'aa=')
SET @strURL = REPLACE(@strURL, 'as_eq=', 'aa=')
--Get Delimiter
WHILE @strDelimiter IS NULL
BEGIN
-- most used search delimiters
IF CHARINDEX('q=',@strURL) > 0
BEGIN
SET @strDelimiter = 'q='
BREAK
END
IF CHARINDEX('p=',@strURL) > 0
BEGIN
SET @strDelimiter = 'p='
BREAK
END
IF CHARINDEX('query=',@strURL) > 0
BEGIN
SET @strDelimiter = '='
BREAK
END
IF CHARINDEX('search=',@strURL) > 0
BEGIN
SET @strDelimiter = 'search='
BREAK
END
IF CHARINDEX('qry=',@strURL) > 0
BEGIN
SET @strDelimiter = 'qry='
BREAK
END
IF CHARINDEX('p=',@strURL) > 0
BEGIN
SET @strDelimiter = 'p='
BREAK
END
IF CHARINDEX('words=',@strURL) > 0
BEGIN
SET @strDelimiter = 'words='
BREAK
END
IF CHARINDEX('word=',@strURL) > 0
BEGIN
SET @strDelimiter = 'word='
BREAK
END
IF CHARINDEX('MT=',@strURL) > 0
BEGIN
SET @strDelimiter = 'MT='
BREAK
END
IF CHARINDEX('s=',@strURL) > 0
BEGIN
SET @strDelimiter = 's='
BREAK
END
IF CHARINDEX('key=',@strURL) > 0
BEGIN
SET @strDelimiter = 'key='
BREAK
END
-- end most used
IF @strDelimiter IS NULL SET @strDelimiter = ''
END
IF CHARINDEX(@strDelimiter,@strURL) > 0
BEGIN
SET @queryStr = SUBSTRING(@strURL,CHARINDEX(@strDelimiter,@strURL)+LEN(@strDelimiter),LEN(@strURL))
SET @ampStart = CHARINDEX('&',@queryStr)
IF @ampStart > 0 SET @queryStr = LEFT(@queryStr,@ampStart-1)
SET @slashStart = CHARINDEX('/',@queryStr)
IF @slashStart > 0 SET @queryStr = LEFT(@queryStr,@slashStart-1)
SET @queryStr = REPLACE(@queryStr, '%20', ' ')
SET @queryStr = REPLACE(@queryStr, '%21', '!')
SET @queryStr = REPLACE(@queryStr, '%22', '"')
SET @queryStr = REPLACE(@queryStr, '%23', '#')
SET @queryStr = REPLACE(@queryStr, '%24', '$')
SET @queryStr = REPLACE(@queryStr, '%25', '%')
SET @queryStr = REPLACE(@queryStr, '%26', '&')
SET @queryStr = REPLACE(@queryStr, '%27', '''')
SET @queryStr = REPLACE(@queryStr, '%28', '(')
SET @queryStr = REPLACE(@queryStr, '%29', ')')
SET @queryStr = REPLACE(@queryStr, '%2A', '*')
SET @queryStr = REPLACE(@queryStr, '%2B', '+')
SET @queryStr = REPLACE(@queryStr, '%2C', ',')
SET @queryStr = REPLACE(@queryStr, '%2D', '-')
SET @queryStr = REPLACE(@queryStr, '%2E', '.')
SET @queryStr = REPLACE(@queryStr, '%2F', '/')
SET @queryStr = REPLACE(@queryStr, '%3A', ':')
SET @queryStr = REPLACE(@queryStr, '%3B', ';')
SET @queryStr = REPLACE(@queryStr, '%3C', '<')
SET @queryStr = REPLACE(@queryStr, '%3D', '=')
SET @queryStr = REPLACE(@queryStr, '%3E', '>')
SET @queryStr = REPLACE(@queryStr, '%3F', '?')
SET @queryStr = REPLACE(@queryStr, '%40', '@')
SET @queryStr = REPLACE(@queryStr, '%5B', '[')
SET @queryStr = REPLACE(@queryStr, '%5C', '')
SET @queryStr = REPLACE(@queryStr, '%5D', ']')
SET @queryStr = REPLACE(@queryStr, '%5E', '^')
SET @queryStr = REPLACE(@queryStr, '%5F', '_')
SET @queryStr = REPLACE(@queryStr, '%60', '`')
SET @queryStr = REPLACE(@queryStr, '%7B', '{')
SET @queryStr = REPLACE(@queryStr, '%7C', '|')
SET @queryStr = REPLACE(@queryStr, '%7D', '}')
SET @queryStr = REPLACE(@queryStr, '+', ' ')
END
if @queryStr is null SET @queryStr = @strURL
RETURN @queryStr
END
If you do not have VS.NET but still wish to use the C# version of qscleaner, you can download the dll here: qscleaner.dll or you can email me for the SLN file and compile it yourself using msbuild which is included in the .NET 2.0 framework. Anyway, save the dll to the DATA directory that contains your .mdf files (for example: C:\Program Files\Microsoft SQL Server\MSSQL.1\MSSQL\Data). To "install" the UDF, run the following:
CREATE ASSEMBLY QSCleaner
FROM 'C:\Program Files\Microsoft SQL Server\MSSQL.1\MSSQL\Data\qscleaner.dll'
WITH PERMISSION_SET = SAFE
GO
CREATE FUNCTION dbo.QSCleaner(@strURL nvarchar) RETURNS nvarchar
AS EXTERNAL NAME QSCleaner.UserDefinedFunctions.QSCleaner
GO
To call the function, create a query that's something like this: SELECT TOP 1000 dbo.qscleaner(referer) as keywords FROM hitcounter WHERE referer IS NOT NULL ORDER BY id DESC
Actually, I take it back. The above dll/assembly/function creation doesn't work -- I just receive 1000 h's all the way down. Compiling it via VS.NET works and the T-SQL function works but the manual creation does not. If anyone can shed some light, I'd appreciate it!