UTF-8 encoding with dplyr and SQLite
I had the same problem. I solved it like below. However, I do not guarantee that the solution is rock solid. Give it a try:
library(dplyr)library(sqldf)# Modifying built-in mtcars datasetmtcars$test <- c("č", "ž", "š", "č", "ž", "š", letters) %>% enc2utf8(.)mtcars$češćžä <- c("č", "ž", "š", "č", "ž", "š", letters) %>% enc2utf8(.)names(mtcars) <- iconv(names(mtcars), "cp1250", "utf-8")# Connecting to sqlite databasemy_db <- src_sqlite("my_db.sqlite3", create = T)# exporting mtcars dataset to databasecopy_to(my_db, mtcars, temporary = FALSE)# dbSendQuery(my_db$con, "drop table mtcars")# getting data from sqlite databasemy_mtcars_from_db <- collect(tbl(my_db, "mtcars"))# disconnecting from databasedbDisconnect(my_db$con)
convert_to_encoding() function
# a function that encodes # column names and values in character columns# with specified encodingsconvert_to_encoding <- function(x, from_encoding = "UTF-8", to_encoding = "cp1250"){ # names of columns are encoded in specified encoding my_names <- iconv(names(x), from_encoding, to_encoding) # if any column name is NA, leave the names # otherwise replace them with new names if(any(is.na(my_names))){ names(x) } else { names(x) <- my_names } # get column classes x_char_columns <- sapply(x, class) # identify character columns x_cols <- names(x_char_columns[x_char_columns == "character"]) # convert all string values in character columns to # specified encoding x <- x %>% mutate_each_(funs(iconv(., from_encoding, to_encoding)), x_cols) # return x return(x) }# useconvert_to_encoding(my_mtcars_from_db, "UTF-8", "cp1250")
Results
# before conversionmy_mtcars_from_dbSource: local data frame [32 x 13] mpg cyl disp hp drat wt qsec vs am gear carb ÄŤešćžä test1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 ÄŤ ÄŤ2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 Ĺľ Ĺľ3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 š š4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 ÄŤ ÄŤ5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 Ĺľ Ĺľ6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 š š7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 a a8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 b b9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 c c10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 d d.. ... ... ... ... ... ... ... .. .. ... ... ... ...# after conversionconvert_to_encoding(my_mtcars_from_db, "UTF-8", "cp1250")Source: local data frame [32 x 13] mpg cyl disp hp drat wt qsec vs am gear carb test češćžä1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 č č2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 ž ž3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 š š4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 č č5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 ž ž6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 š š7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 a a8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 b b9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 c c10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 d d.. ... ... ... ... ... ... ... .. .. ... ... ... ...
Session information
devtools::session_info()Session info ------------------------------------------------------------------- setting value version R version 3.2.0 (2015-04-16) system x86_64, mingw32 ui RStudio (0.99.441) language (EN) collate Slovenian_Slovenia.1250 tz Europe/Prague Packages ----------------------------------------------------------------------- package * version date source assertthat * 0.1 2013-12-06 CRAN (R 3.2.0) chron * 2.3-45 2014-02-11 CRAN (R 3.2.0) DBI 0.3.1 2014-09-24 CRAN (R 3.2.0) devtools * 1.7.0 2015-01-17 CRAN (R 3.2.0) dplyr 0.4.1 2015-01-14 CRAN (R 3.2.0) gsubfn 0.6-6 2014-08-27 CRAN (R 3.2.0) lazyeval * 0.1.10 2015-01-02 CRAN (R 3.2.0) magrittr * 1.5 2014-11-22 CRAN (R 3.2.0) proto 0.3-10 2012-12-22 CRAN (R 3.2.0) R6 * 2.0.1 2014-10-29 CRAN (R 3.2.0) Rcpp * 0.11.6 2015-05-01 CRAN (R 3.2.0) RSQLite 1.0.0 2014-10-25 CRAN (R 3.2.0) rstudioapi * 0.3.1 2015-04-07 CRAN (R 3.2.0) sqldf 0.4-10 2014-11-07 CRAN (R 3.2.0)