When importing large amounts of data into SQL Server 2008 there are several choices. Most people working with SQL Server are aware of the BULK INSERT statement and the bcp utility. In the most simple case, we can specify a text file that contains information in CSV format and import it with a single SQL statement.
In many cases this might work well, yet there are also a lot of circumstances under which this approach is bound to fail. Especially when you are dealing with data that is delimited according to a different convention, such as escaped strings, single quotes instead of double quotes, binary data and the like.
Already simple tasks like skipping a certain column in output or parsing text in a different encoding on one column can be challenging. There is a simple way of doing the latter by using XML format files that can specify more complex import rules.
Here is an example:
<?xml version="1.0"?>
<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<RECORD>
<FIELD ID="1" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="2" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="3" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="4" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="5" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="6" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="7" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="8" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="9" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="10" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="11" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="12" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="13" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="14" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="15" xsi:type="CharTerm" TERMINATOR=";" COLLATION="Latin1_General_CI_AS"/>
<FIELD ID="16" xsi:type="CharTerm" TERMINATOR="\n" MAX_LENGTH="2000" COLLATION="Latin1_General_CI_AS"/>
</RECORD>
<ROW>
<COLUMN SOURCE="1" NAME="WarenwirtschaftsID" xsi:type="SQLINT"/>
<COLUMN SOURCE="2" NAME="Bezeichnung" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="3" NAME="Beschreibung" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="4" NAME="Vorname" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="5" NAME="Ort" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="6" NAME="PLZ" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="7" NAME="Land" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="8" NAME="Strasse" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="11" NAME="Telefon" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="14" NAME="Fax" xsi:type="SQLNVARCHAR"/>
<COLUMN SOURCE="15" NAME="Email" xsi:type="SQLVARYCHAR"/>
</ROW>
</BCPFORMAT>
The above serves as a format description for the following ETL script:
INSERT INTO Kunden ( Bezeichnung, Strasse, PLZ, Ort, Land, Telefon, Fax, Email, WarenwirtschaftsID,
Beschreibung, HotlineVertragVorhanden, ZeiterfassungsID, GeloeschtAm, Ansprechpartner )
SELECT Bezeichnung, Strasse, PLZ, Ort, Land, Telefon, Fax, Email, WarenwirtschaftsID,
Beschreibung, 0 AS HotlineVertragVorhanden, NULL AS ZeiterfassungsID, NULL AS GeloeschtAm,
CASE WHEN Vorname IS NOT NULL THEN Vorname + ' ' + Bezeichnung ELSE NULL END AS Ansprechpartner
FROM OPENROWSET(BULK '.....\KUNDEN.csv' ,
FORMATFILE = '.....\Kunden.xml',
FIRSTROW = 2, CODEPAGE = '1252'
) AS T ;
The neat thing about the BULK INSERT command is that you can do it in pure SQL (left aside the XML file) and therefore it can be easily automated as a task/job that runs on a regular basis.
If you are looking for more transformations, you have to resort to a programming language that allows for greater flexibility, such as C#.
In traditional transactional processing you would read each row of data from your data source, transform it and then insert it into the SQL Server data base. Due to the nature of transactional processing, this can be a very inefficient and slow process.
For large amounts of data, it makes sense to use a bulk load mechanism encapsulated in the SqlBulkCopy class. The following method uses a compressed file as an input stream, decompresses it and uses SqlBulkCopy to mass insert all data.
private static void ProcessLocations( ZipInputStream oZip )
{
using ( SqlConnection connection = new SqlConnection( SQL_SERVER_CONNECTION ) )
{
connection.Open();
SqlCommand cmd = connection.CreateCommand();
cmd.CommandText =
"IF EXISTS ( SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[T_Location]') AND type in (N'U')) " +
"TRUNCATE TABLE [dbo].[T_Location];" +
"ELSE " +
"CREATE TABLE T_Location ( LocationID INT PRIMARY KEY CLUSTERED, Country CHAR(2) NOT NULL, Region CHAR(2), City NVARCHAR(50), PostalCoce VARCHAR(50), Latitude REAL, Longitude REAL )";
cmd.CommandTimeout = 60;
cmd.ExecuteNonQuery();
/// open stream with ISO-8859-1 (Latin-1) encoding, do not close streamreader
StreamReader re = new StreamReader( oZip, Encoding.GetEncoding( 28591 ) );
using ( SqlBulkCopy oBulk = new SqlBulkCopy( connection ) )
{
oBulk.BatchSize =
oBulk.NotifyAfter = 50000;
oBulk.DestinationTableName = "T_Location";
oBulk.SqlRowsCopied += new SqlRowsCopiedEventHandler( delegate( object sender, SqlRowsCopiedEventArgs e ) { Output( String.Format( "{0:0,0} Rows imported.", e.RowsCopied ) ); } );
/// need to add a column mapping because we are only storing 7 out of 9 columns.
for ( int i = 0; i < 7; ++i )
oBulk.ColumnMappings.Add( i, i );
LocationCSVReader oBlocks = new LocationCSVReader( re, Output );
oBulk.WriteToServer( oBlocks );
}
}
}
How are files parsed and columns extracted? As you may have guessed, the LocationCSVReader:
private class LocationCSVReader : IDataReader
{
private StreamReader oReader = null;
private object[] oCurrentValues = new object[ 9 ];
private int iCurrentRow = 0;
private OutputEventHandler Output = null;
public delegate void OutputEventHandler( string s );
public LocationCSVReader( StreamReader oReader ) : this( oReader, null ) { }
public LocationCSVReader( StreamReader oReader, OutputEventHandler o )
{
this.oReader = oReader;
this.Output = o;
Output( "Skipping header\r\n" + this.oReader.ReadLine() + "\r\n" + this.oReader.ReadLine() );
this.iCurrentRow = 2;
}
#region IDataReader Members
public bool Read()
{
string input;
while ( ( input = this.oReader.ReadLine() ) != null )
{
++this.iCurrentRow;
input = input.Replace( "\"", "" );
string[] ss = input.Split( new char[] { ',' } );
if ( ss.Length != this.oCurrentValues.Length )
{
if ( this.Output != null )
this.Output( String.Format( "Invalid number of columns on line {0:0,0} - {1}", this.iCurrentRow, String.Join( " - ", ss ) ) );
continue;
}
for ( int i = 0; i < this.oCurrentValues.Length; ++i )
this.oCurrentValues[ i ] = ss[ i ];
this.oCurrentValues[ 0 ] = (Int32)( Int64.Parse( ss[ 0 ] ) - Int32.MaxValue );
this.oCurrentValues[ 5 ] = Double.Parse( ss[ 5 ], System.Globalization.NumberStyles.Float, CultureInfo.InvariantCulture );
this.oCurrentValues[ 6 ] = Double.Parse( ss[ 6 ], System.Globalization.NumberStyles.Float, CultureInfo.InvariantCulture );
return true;
}
return false;
}
// ...
#endregion
#region IDisposable Members
// ...
#endregion
#region IDataRecord Members
// ...
#endregion
}
One important thing to note: Consider that your code will be executed on a system with a different locale than yours. That’s frequently the point when things start to fall apart: E.g. when testing on a German server, things seem alright. Although when published on an English production server, “1,002″ suddenly becomes equivalent to “1.000″ and vice versa. Same problem exists with dates and times. Therefore whenever possible specify a specific culture or CultureInfo.InvariantCulture when parsing text values!