Friday, February 24, 2012

Extracting words from strings

I've done up this script which I'll turn into a user-defined function for
extracting words from text. However it seems a little over-complicated.
Anyone got any ideas of how to simplify it?
Thanks
Damien
SET NOCOUNT ON
DECLARE @.word TINYINT
DECLARE @.search_string VARCHAR( 2000 )
DECLARE @.delimiter CHAR( 1 )
-- Initialise variables
SET @.word = 1
SET @.search_string = 'This is a test function'
SET @.delimiter = ' '
SET @.search_string = @.search_string
DECLARE @.return VARCHAR( 25 )
DECLARE @.string CHAR( 1 )
DECLARE @.pos SMALLINT
DECLARE @.old_pos SMALLINT
DECLARE @.start SMALLINT
DECLARE @.total_len SMALLINT
DECLARE @.words SMALLINT
DECLARE @.len SMALLINT
-- Initialise variables
SET @.start = 1
SET @.old_pos = 1
SET @.pos = 1
SET @.words = 1
SET @.len = 0
SET @.total_len = LEN( @.search_string )
-- Check for errors
IF @.total_len = 0
BEGIN
RAISERROR( 'Invalid search string: %s.', 16, 1, @.search_string )
END
-- SELECT @.search_string AS '12345678901234'
-- Loop thru string one character at a time
WHILE @.pos <= @.total_len
BEGIN
SET @.string = SUBSTRING( @.search_string, @.pos, 1 )
-- Check we're not at the end of the string
IF @.word > @.words BREAK
IF @.pos = @.total_len
BEGIN
SET @.start = @.old_pos
SET @.len = @.pos - @.old_pos + 1
BREAK
END
-- Found a space; any previous text is the first word, any further text
is further word
IF @.string = @.delimiter
BEGIN
IF @.words = @.word
BEGIN
-- Current word is same as requested word
SET @.start = @.old_pos
SET @.len = @.pos - @.old_pos
BREAK
END
ELSE
BEGIN
SET @.old_pos = @.pos + 1
END
SET @.words = @.words + 1
END
-- Debuggin
--SELECT @.pos AS '@.pos', @.string AS '@.string', @.start AS '@.start',
@.old_pos AS '@.old_pos', @.word AS '@.word', @.words AS '@.words', @.len AS '@.len'
-- Increment current position
SET @.pos = @.pos + 1
END
-- Set return value
SET @.return = SUBSTRING( @.search_string, @.start, @.len )
function_exit:
-- RETURN @.return
SELECT @.return, LEN( @.return )
SET NOCOUNT OFFTry this....
if exists (select 'Y' from dbo.sysobjects where id =
object_id(N'[dbo].[fn_Split]'))
BEGIN
Drop Function fn_Split
END
GO
Create Function fn_Split
(
@.SplitString text,
@.Delimiter VARCHAR(1) = ' '
) RETURNS @.Dictionary TABLE (word varchar(8000)) AS
BEGIN
DECLARE
@.ParsedString VARCHAR(8000),
@.element VARCHAR(8000),
@.start_pos int,
@.end_pos int,
@.DataLength int,
@.BytesProcessed int,
@.length int
SELECT
@.BytesProcessed = 0,
@.DataLength = DataLength(@.SplitString)
SELECT @.ParsedString = Substring(@.SplitString, @.BytesProcessed + 1, 8000)
WHILE @.BytesProcessed < @.DataLength BEGIN
SELECT
@.BytesProcessed = @.BytesProcessed + DataLength(@.ParsedString)
SELECT
@.start_pos = 0
SELECT @.end_pos = CHARINDEX(@.Delimiter, @.ParsedString, @.start_pos + 1)
WHILE @.end_pos <> 0 BEGIN
SELECT @.length = (@.end_pos - @.start_pos) - 1
SELECT
@.element = SUBSTRING(@.ParsedString, @.start_pos + 1, @.length),
@.start_pos = @.end_pos
INSERT INTO @.Dictionary
VALUES (@.element)
SELECT @.end_pos = CHARINDEX(@.Delimiter, @.ParsedString, @.start_pos + 1)
END
SELECT @.ParsedString = SUBSTRING(@.ParsedString, @.start_pos + 1, 8000)
SELECT @.ParsedString = @.ParsedString + Substring(@.SplitString,
@.BytesProcessed + 1, 8000 - DataLength(@.ParsedString))
END
SELECT @.element = @.ParsedString
IF @.element IS NOT NULL
INSERT INTO @.Dictionary
VALUES (@.element)
RETURN
END
GO
select * from dbo.fn_Split('This is a test function',' ')|||I can't follow exactly what you're trying to do in that code but it looks
like you're trying to return the string of searched text from a string being
searched. For example, searching for "B" in "ABC".
In that case you could do something like:
DECLARE
@.string VARCHAR(10)
, @.search VARCHAR(10)
SELECT
@.string = 'ABC'
, @.search = 'B'
SELECT SUBSTRING( @.string, CHARINDEX(@.search, @.string), LEN(@.search) )
That code will return "B"; if you searched for "D" it would return an empty
string.
But doing that doesn't really make sense, because you already know the
string you're searching for. Couldn't you just use CHARINDEX(), and if it
returns > 0 reuse the string that is the text you're searching for?
Only caveat in all that is that it is not case sensitive for
case-insentitive collations.
"Damien" wrote:

> I've done up this script which I'll turn into a user-defined function for
> extracting words from text. However it seems a little over-complicated.
> Anyone got any ideas of how to simplify it?
> Thanks
>
> Damien
>
> SET NOCOUNT ON
> DECLARE @.word TINYINT
> DECLARE @.search_string VARCHAR( 2000 )
> DECLARE @.delimiter CHAR( 1 )
> -- Initialise variables
> SET @.word = 1
> SET @.search_string = 'This is a test function'
> SET @.delimiter = ' '
> SET @.search_string = @.search_string
> DECLARE @.return VARCHAR( 25 )
> DECLARE @.string CHAR( 1 )
> DECLARE @.pos SMALLINT
> DECLARE @.old_pos SMALLINT
> DECLARE @.start SMALLINT
> DECLARE @.total_len SMALLINT
> DECLARE @.words SMALLINT
> DECLARE @.len SMALLINT
> -- Initialise variables
> SET @.start = 1
> SET @.old_pos = 1
> SET @.pos = 1
> SET @.words = 1
> SET @.len = 0
> SET @.total_len = LEN( @.search_string )
> -- Check for errors
> IF @.total_len = 0
> BEGIN
> RAISERROR( 'Invalid search string: %s.', 16, 1, @.search_string )
> END
> -- SELECT @.search_string AS '12345678901234'
> -- Loop thru string one character at a time
> WHILE @.pos <= @.total_len
> BEGIN
> SET @.string = SUBSTRING( @.search_string, @.pos, 1 )
> -- Check we're not at the end of the string
> IF @.word > @.words BREAK
> IF @.pos = @.total_len
> BEGIN
> SET @.start = @.old_pos
> SET @.len = @.pos - @.old_pos + 1
> BREAK
> END
> -- Found a space; any previous text is the first word, any further tex
t
> is further word
> IF @.string = @.delimiter
> BEGIN
> IF @.words = @.word
> BEGIN
> -- Current word is same as requested word
> SET @.start = @.old_pos
> SET @.len = @.pos - @.old_pos
> BREAK
> END
> ELSE
> BEGIN
> SET @.old_pos = @.pos + 1
> END
> SET @.words = @.words + 1
> END
> -- Debuggin
> --SELECT @.pos AS '@.pos', @.string AS '@.string', @.start AS '@.start',
> @.old_pos AS '@.old_pos', @.word AS '@.word', @.words AS '@.words', @.len AS '@.le
n'
> -- Increment current position
> SET @.pos = @.pos + 1
> END
>
> -- Set return value
> SET @.return = SUBSTRING( @.search_string, @.start, @.len )
>
> function_exit:
> -- RETURN @.return
> SELECT @.return, LEN( @.return )
> SET NOCOUNT OFF
>|||Use a table of sequential numbers for this purpose instead of loop routines.
You can create one easily using the hack:
SELECT TOP 2000 IDENTITY ( INT ) n INTO Nbrs
FROM sysobjects s1, sysobjects s2 ;
Now your requirement is simpler:
DECLARE @.search_string VARCHAR( 2000 )
DECLARE @.delimiter CHAR( 1 )
SET @.search_string = 'This is a test function'
SET @.delimiter = ' '
SELECT SUBSTRING( @.search_string, n, CHARINDEX( @.delimiter,
@.search_string + @.delimiter, n ) - n )
FROM Nbrs
WHERE SUBSTRING( @.delimiter + @.search_string, n, 1 ) = @.delimiter
AND n < LEN( @.search_string ) + 1 ;
Anith

No comments:

Post a Comment