How can I convert values stored in ANSI (Windows 1252) in a database to UTF-8 How can I convert values stored in ANSI (Windows 1252) in a database to UTF-8 sqlite sqlite

How can I convert values stored in ANSI (Windows 1252) in a database to UTF-8


I had the same problem once,

Jon Skeet answered it here:

Basically, take the garbled string, turn it back into bytes using the wrong encoding it was decoded with, and then decode those bytes using the encoding the data was really stored in:

// Garbled text as it is read from the database: the UTF-8 bytes of "é" (0xC3 0xA9)
// were decoded as ISO-8859-1, which shows up as "Ã©".
string broken = "Brokers MÃ©xico, Intermediario de Aseguro,S.A."; // Get text from database

// Re-encode with the encoding that was wrongly used for decoding; this recovers the original bytes.
byte[] encoded = Encoding.GetEncoding(28591).GetBytes(broken);

// Decode those bytes with the encoding the data was really stored in.
string corrected = Encoding.UTF8.GetString(encoded);

So yours should simply be

// Placeholder for the garbled value read from the database.
string broken = "Whatever";

// Windows-1252 is the encoding the text was wrongly decoded with, so it is used to get the raw bytes back.
// NOTE(review): on .NET Core/.NET 5+ code page 1252 is presumably only available after
// registering CodePagesEncodingProvider - confirm against the target framework.
Encoding windows1252 = Encoding.GetEncoding(1252);
byte[] encoded = windows1252.GetBytes(broken);

// The recovered bytes are interpreted as UTF-8.
string corrected = Encoding.UTF8.GetString(encoded);

Basically, now that you know that the re-conversion program is correct, I'd play around with the encodings mentioned here:
https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
(just write a program that tests-through all the likely possibilities listed there, and see which pair will yield a match...)

If you know the source text, you could even perform the checking automagically:

/// <summary>
/// Diagnostic form that brute-forces pairs of encodings to find out which
/// "encode with A, decode with B" combination turns a garbled database value
/// back into the known-correct text.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Matching encoding pairs of the most recent run; bound to <c>dataGridView1</c>.</summary>
    public System.Data.DataTable dt;

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Tries every (source, target) pair from a hand-picked list of code pages and
    /// lists the pairs for which re-encoding <c>broken</c> with the source encoding and
    /// decoding the bytes with the target encoding yields <c>correct</c>.
    /// </summary>
    /// <param name="sender">Event source; not used.</param>
    /// <param name="e">Event data; not used.</param>
    private void btnTest_Click(object sender, EventArgs e)
    {
        dt = new System.Data.DataTable();

        // The text as it should read.
        string correct = "Brokers México, Intermediario de Aseguro,S.A.";

        // The garbled text as it comes out of the database. It has to differ from "correct":
        // with two identical strings the search only reports lossless round trips and says
        // nothing about the real encoding mismatch.
        string broken = "Brokers MÃ©xico, Intermediario de Aseguro,S.A."; // Get text from database

        dt.Columns.Add("SourceEncoding", typeof(string));
        dt.Columns.Add("TargetEncoding", typeof(string));
        dt.Columns.Add("Result", typeof(string));
        dt.Columns.Add("SourceEncodingName", typeof(string));
        dt.Columns.Add("TargetEncodingName", typeof(string));

        // For reference
        // https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
        // NOTE(review): on .NET Core/.NET 5+ the Windows/ISO code pages below (other than 28591)
        // are presumably only available after registering CodePagesEncodingProvider - confirm.
        int[] encs = new int[]
        {
            20127,  // US-ASCII
            28591,  // iso-8859-1 Western European (ISO)
            28592,  // iso-8859-2 Central European (ISO)
            28593,  // iso-8859-3 Latin 3 (ISO)
            28594,  // iso-8859-4 Baltic (ISO)
            28595,  // iso-8859-5 Cyrillic (ISO)
            28596,  // iso-8859-6 Arabic (ISO)
            28597,  // iso-8859-7 Greek (ISO)
            28598,  // iso-8859-8 Hebrew (ISO-Visual)
            28599,  // iso-8859-9 Turkish (ISO)
            28603,  // iso-8859-13 Estonian (ISO)
            28605,  // iso-8859-15 Latin 9 (ISO)
            1250,   // windows-1250 Central European (Windows)
            1251,   // windows-1251 Cyrillic (Windows)
            1252,   // Windows-1252 Western European (Windows)
            1253,   // windows-1253 Greek (Windows)
            1254,   // windows-1254 Turkish (Windows)
            1255,   // windows-1255 Hebrew (Windows)
            1256,   // windows-1256 Arabic (Windows)
            1257,   // windows-1257 Baltic (Windows)
            1258,   // windows-1258 Vietnamese (Windows)
            20866,  // Cyrillic (KOI8-R)
            21866,  // Cyrillic (KOI8-U)
            65000,  // UTF-7
            65001,  // UTF-8
            1200,   // UTF-16
            1201,   // Unicode (Big-Endian)
            12000,  // UTF-32
            12001   // UTF-32BE (UTF-32 Big-Endian)
        };

        for (int i = 0; i < encs.Length; ++i)
        {
            // The source encoding and the bytes it produces do not depend on the
            // target encoding, so they are computed once per outer iteration.
            System.Text.Encoding enci = System.Text.Encoding.GetEncoding(encs[i]);
            byte[] encoded = enci.GetBytes(broken);

            for (int j = 0; j < encs.Length; ++j)
            {
                System.Text.Encoding encj = System.Text.Encoding.GetEncoding(encs[j]);
                string corrected = encj.GetString(encoded);

                if (!StringComparer.InvariantCultureIgnoreCase.Equals(correct, corrected))
                {
                    continue;
                }

                // Only matching pairs get a row.
                System.Data.DataRow dr = dt.NewRow();
                dr["SourceEncoding"] = encs[i];
                dr["TargetEncoding"] = encs[j];
                dr["Result"] = corrected;
                dr["SourceEncodingName"] = enci.BodyName;
                dr["TargetEncodingName"] = encj.BodyName;
                dt.Rows.Add(dr);
            }
        }

        this.dataGridView1.DataSource = dt;
    }
}

Or even more thorough, just test all encodings:

/// <summary>
/// Same search as the hand-picked variant, but over every encoding reported by
/// <c>System.Text.Encoding.GetEncodings()</c>: lists each (source, target) pair for which
/// re-encoding <c>broken</c> with the source encoding and decoding the bytes with the
/// target encoding yields <c>correct</c>.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void btnTestAll_Click(object sender, EventArgs e)
{
    dt = new System.Data.DataTable();

    // The text as it should read.
    string correct = "Brokers México, Intermediario de Aseguro,S.A.";

    // The garbled text as it comes out of the database. It has to differ from "correct":
    // with two identical strings the search only reports lossless round trips and says
    // nothing about the real encoding mismatch.
    string broken = "Brokers MÃ©xico, Intermediario de Aseguro,S.A."; // Get text from database

    dt.Columns.Add("SourceEncoding", typeof(string));
    dt.Columns.Add("TargetEncoding", typeof(string));
    dt.Columns.Add("Result", typeof(string));
    dt.Columns.Add("SourceEncodingName", typeof(string));
    dt.Columns.Add("TargetEncodingName", typeof(string));

    System.Text.EncodingInfo[] encs = System.Text.Encoding.GetEncodings();

    for (int i = 0; i < encs.Length; ++i)
    {
        // The source encoding and the bytes it produces do not depend on the
        // target encoding, so they are computed once per outer iteration.
        System.Text.Encoding enci = System.Text.Encoding.GetEncoding(encs[i].CodePage);
        byte[] encoded = enci.GetBytes(broken);

        for (int j = 0; j < encs.Length; ++j)
        {
            System.Text.Encoding encj = System.Text.Encoding.GetEncoding(encs[j].CodePage);
            string corrected = encj.GetString(encoded);

            if (!StringComparer.InvariantCultureIgnoreCase.Equals(correct, corrected))
            {
                continue;
            }

            // Only matching pairs get a row.
            System.Data.DataRow dr = dt.NewRow();
            dr["SourceEncoding"] = encs[i].CodePage;
            dr["TargetEncoding"] = encs[j].CodePage;
            dr["Result"] = corrected;
            dr["SourceEncodingName"] = enci.BodyName;
            dr["TargetEncodingName"] = encj.BodyName;
            dt.Rows.Add(dr);
        }
    }

    this.dataGridView1.DataSource = dt;
}

You can download the result here:

It's strange, it looks like you can get from German/ANSI (or ISO-8859-1) to ASCII, but there is NO WAY to convert it back (information loss)...

/// <summary>
/// Demonstrates the lossy direction of a wrong conversion: a string containing "ü"
/// is encoded as ISO-8859-1 (code page 28591) and the resulting bytes are then
/// decoded as ASCII.
/// </summary>
/// <returns>The ASCII-decoded text of the ISO-8859-1 bytes.</returns>
public static string lol()
{
    const string original = "Alu-Dreieckstütze";

    // Alternative source encoding that was tried: System.Text.Encoding.Default
    System.Text.Encoding latin1 = System.Text.Encoding.GetEncoding(28591);
    byte[] latin1Bytes = latin1.GetBytes(original);

    // Decode the ISO-8859-1 bytes with the wrong (ASCII) encoding and hand back the result.
    return System.Text.Encoding.ASCII.GetString(latin1Bytes);
}

The funny thing is, since the legacy app displays it correctly, it can't have lost the information.

So are you sure you haven't put a wrong (or no) encoding in the Sqlite connectionString ?

e.g.

  "Data Source=C:\\Users\\USERNAME\\Desktop\\location.db; Version=3; UseUTF16Encoding=True;Synchronous=Normal;New=False"; // set up the connection string

https://www.sqlite.org/c3ref/c_any.html

It looks like you can check which text encoding the database uses by running `PRAGMA encoding;`


2 steps:
First, read the value from the database as a byte array.
Second, decode that byte array into a string using the Windows-1252 encoding.
Something like this:

// Step 1: fetch the raw bytes so that no text decoding is applied by the data provider.
// The indexer returns object, so an explicit cast is required.
// NOTE(review): assumes the provider returns byte[] for this column (e.g. a BLOB value) - confirm.
byte[] buffer = (byte[])dataReader["colomnName"];

// Step 2: decode the bytes as Windows-1252 (the ANSI code page in question).
// Code page 28591 (ISO-8859-1) is not equivalent: it differs in the 0x80-0x9F range.
var encoding = Encoding.GetEncoding(1252);
string s = encoding.GetString(buffer);


I also have to import data from a source that encodes its strings incorrectly. With the Microsoft.Data.Sqlite library it is quite easy to register a user-defined function that fixes the encoding. I am also using Dapper in this example:

// Opens the database, registers the SQL function "fixencoding" on the connection and
// runs the GetBoards query, mapping each row to a Board.
using (var connection = new SqliteConnection($"Data Source={databasePath}"))
{
    // Decodes a raw column value as Windows-1252 text.
    // NOTE(review): on .NET Core/.NET 5+ code page 1252 is presumably only available after
    // registering CodePagesEncodingProvider - confirm against the target framework.
    System.Func<byte[], string> decodeWindows1252 =
        (byte[] value) => Encoding.GetEncoding(1252).GetString(value);

    // The function is registered before Open(), exactly as the query expects it to exist.
    connection.CreateFunction("fixencoding", decodeWindows1252, isDeterministic: true);
    connection.Open();

    return connection.Query<Board>(Properties.Resources.GetBoards);
}

For this class:

/// <summary>
/// Row type for the GetBoards query; each property name matches a column alias
/// of that query.
/// </summary>
public class Board
{
    /// <summary>Value of the <c>Code</c> column.</summary>
    public string Code { get; set; }

    /// <summary>Value of the <c>Description</c> column.</summary>
    public string Description { get; set; }

    /// <summary>Value of the <c>Length</c> column.</summary>
    public decimal Length { get; set; }

    /// <summary>Value of the <c>Width</c> column.</summary>
    public decimal Width { get; set; }

    /// <summary>Value of the <c>Thickness</c> column.</summary>
    public decimal Thickness { get; set; }

    /// <summary>Value of the <c>Quantity</c> column.</summary>
    public int Quantity { get; set; }
}

and this query (Properties.Resources.GetBoards):

-- Reads all boards; the text columns go through the user-defined function
-- fixencoding, which must be registered on the connection before this runs.
SELECT
  fixencoding(CODE) AS Code,
  fixencoding(DESC) AS Description,
  LNGT AS Length,
  WIDT AS Width,
  THCK AS Thickness,
  QNTY AS Quantity
FROM
  BOARDS

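If the source uses the same system locale, on .NET Framework it's possible to use just Encoding.Default.GetString(value) instead of Encoding.GetEncoding(1252).GetString(value). Note that on .NET Core and .NET 5+ Encoding.Default is always UTF-8, so the explicit code page is required there.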