UTF-8 encoding with dplyr and SQLite UTF-8 encoding with dplyr and SQLite sqlite sqlite

UTF-8 encoding with dplyr and SQLite


I had the same problem. I solved it like below. However, I do not guarantee that the solution is rock solid. Give it a try:

library(dplyr)library(sqldf)# Modifying built-in mtcars datasetmtcars$test <-   c("č", "ž", "š", "č", "ž", "š", letters) %>%   enc2utf8(.)mtcars$češćžä <-   c("č", "ž", "š", "č", "ž", "š", letters) %>%   enc2utf8(.)names(mtcars) <-   iconv(names(mtcars), "cp1250", "utf-8")# Connecting to sqlite databasemy_db <- src_sqlite("my_db.sqlite3", create = T)# exporting mtcars dataset to databasecopy_to(my_db, mtcars, temporary = FALSE)# dbSendQuery(my_db$con, "drop table mtcars")# getting data from sqlite databasemy_mtcars_from_db <-  collect(tbl(my_db, "mtcars"))# disconnecting from databasedbDisconnect(my_db$con)

convert_to_encoding() function

# a function that encodes # column names and values in character columns# with specified encodingsconvert_to_encoding <-   function(x, from_encoding = "UTF-8", to_encoding = "cp1250"){    # names of columns are encoded in specified encoding    my_names <-       iconv(names(x), from_encoding, to_encoding)     # if any column name is NA, leave the names    # otherwise replace them with new names    if(any(is.na(my_names))){      names(x)    } else {      names(x) <- my_names    }    # get column classes    x_char_columns <- sapply(x, class)    # identify character columns    x_cols <- names(x_char_columns[x_char_columns == "character"])    # convert all string values in character columns to     # specified encoding    x <-       x %>%      mutate_each_(funs(iconv(., from_encoding, to_encoding)),                    x_cols)    # return x    return(x)  }# useconvert_to_encoding(my_mtcars_from_db, "UTF-8", "cp1250")

Results

# before conversionmy_mtcars_from_dbSource: local data frame [32 x 13]    mpg cyl  disp  hp drat    wt  qsec vs am gear carb ÄŤešćžä test1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          ÄŤ   ÄŤ2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          Ĺľ   Ĺľ3  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          š   š4  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          ÄŤ   ÄŤ5  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2          Ĺľ   Ĺľ6  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1          š   š7  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4           a    a8  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2           b    b9  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2           c    c10 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4           d    d..  ... ...   ... ...  ...   ...   ... .. ..  ...  ...         ...  ...# after conversionconvert_to_encoding(my_mtcars_from_db, "UTF-8", "cp1250")Source: local data frame [32 x 13]    mpg cyl  disp  hp drat    wt  qsec vs am gear carb test češćžä1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4    č      č2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4    ž      ž3  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1    š      š4  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1    č      č5  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2    ž      ž6  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1    š      š7  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4    a      a8  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2    b      b9  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2    c      c10 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4    d      d..  ... ...   ... ...  ...   ...   ... .. ..  ...  ...  ...    ...

Session information

devtools::session_info()Session info ------------------------------------------------------------------- setting  value                        version  R version 3.2.0 (2015-04-16) system   x86_64, mingw32              ui       RStudio (0.99.441)           language (EN)                         collate  Slovenian_Slovenia.1250      tz       Europe/Prague               Packages ----------------------------------------------------------------------- package    * version date       source         assertthat * 0.1     2013-12-06 CRAN (R 3.2.0) chron      * 2.3-45  2014-02-11 CRAN (R 3.2.0) DBI          0.3.1   2014-09-24 CRAN (R 3.2.0) devtools   * 1.7.0   2015-01-17 CRAN (R 3.2.0) dplyr        0.4.1   2015-01-14 CRAN (R 3.2.0) gsubfn       0.6-6   2014-08-27 CRAN (R 3.2.0) lazyeval   * 0.1.10  2015-01-02 CRAN (R 3.2.0) magrittr   * 1.5     2014-11-22 CRAN (R 3.2.0) proto        0.3-10  2012-12-22 CRAN (R 3.2.0) R6         * 2.0.1   2014-10-29 CRAN (R 3.2.0) Rcpp       * 0.11.6  2015-05-01 CRAN (R 3.2.0) RSQLite      1.0.0   2014-10-25 CRAN (R 3.2.0) rstudioapi * 0.3.1   2015-04-07 CRAN (R 3.2.0) sqldf        0.4-10  2014-11-07 CRAN (R 3.2.0)