Skip to content

Commit d1fb71b

Browse files
fix encoding issues regarding hex-codes
1 parent acb36c7 commit d1fb71b

1 file changed

Lines changed: 44 additions & 2 deletions

File tree

Script/clean-final-data.R

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# read data
2-
editors <- read.csv("Output\\editors.csv", header = TRUE)
2+
# Load the editor table, decoding the file as UTF-8.
# file.path() builds the relative path with a separator that resolves on every
# platform; the previous "Output\\editors.csv" literal only worked on Windows.
editors <- read.csv(
  file.path("Output", "editors.csv"),
  header = TRUE,
  fileEncoding = "UTF-8"
)
33

44
# remove automatically created numeric columns
55
# Drop the "X" column (a no-op when the column is absent).
# NOTE(review): presumably "X" is the row-name column of a previously written
# CSV as re-read by read.csv -- confirm.
editors[["X"]] <- NULL
@@ -16,5 +16,47 @@ editors <- editors[!is.na(editors$publisher), ]
1616
# Flag rows in which both the editor name and the affiliation are missing.
missing_both <- with(editors, is.na(editor) & is.na(affiliation))

# Keep the flagged rows aside for inspection, then drop them from the data.
# NOTE(review): the trailing "# 0" presumably records that no such rows were
# found when the script was written -- confirm.
nas_df <- editors[which(missing_both), ] # 0
editors <- editors[!missing_both, ]
1818

19+
# Clean encoding: turn literal "<xx>" byte escapes back into the characters
# they stand for.
#
# `ascii` is a lookup table with one row per code 0xA0-0xFF (96 rows):
#   Hex    - the escape exactly as it appears in the text, e.g. "<e9>"
#   Actual - the replacement character, e.g. "\u00e9"
#
# In this range the Latin-1 byte value equals the Unicode code point, so the
# table is generated instead of typed out. That keeps the script free of
# non-ASCII literals (whose meaning depends on how the script file itself is
# decoded) and rules out slips in a hand-written list, such as an empty
# replacement for "<b6>" (pilcrow) or the literal text "SHY" for "<ad>"
# (soft hyphen), which would have been inserted into the data verbatim.
latin1_codes <- 0xa0:0xff
ascii <- data.frame(
  Hex = sprintf("<%02x>", latin1_codes),
  Actual = intToUtf8(latin1_codes, multiple = TRUE),
  stringsAsFactors = FALSE
)

# "<a0>" (no-break space) is normalised to an ordinary space.
ascii$Actual[ascii$Hex == "<a0>"] <- " "
41+
42+
# Apply the hex-code lookup table to every text column that may contain
# "<xx>" escapes. With vectorize_all = FALSE, each string has every pattern in
# ascii$Hex replaced by the corresponding entry of ascii$Actual.
for (text_col in c("editor", "affiliation", "journal")) {
  editors[[text_col]] <- stringi::stri_replace_all_fixed(
    editors[[text_col]],
    ascii$Hex,
    ascii$Actual,
    vectorize_all = FALSE
  )
}
60+
1961
# save the cleaned data
20-
write.csv(editors, "Output\\editors.csv")
62+
# Write the cleaned table back as UTF-8.
# file.path() builds the relative path with a separator that resolves on every
# platform; the previous "Output\\editors.csv" literal only worked on Windows.
# NOTE(review): write.csv() also writes row names by default; this looks like
# the source of the "X" column dropped at the top of the script. Consider
# row.names = FALSE once no downstream reader relies on that column.
write.csv(
  editors,
  file.path("Output", "editors.csv"),
  fileEncoding = "UTF-8"
)

0 commit comments

Comments
 (0)