Show the code
# Loading necessary libraries ####
library(tidyverse)
library(here)
library(janitor)
library(readxl)
library(writexl)
library(shiny)
library(DT)
library(flextable)
# Loading datasets ####
# File names of the raw inputs. Each one is joined to "data/kanji" with here()
# at the point where the file is read.
file_kanji <- "Kanji_20240227_081842.csv"
file_jukugo <- "Jukugo_20240227_081908.csv"
file_words <- "Optimized Kore - Sheet1.csv"
file_sentences1 <- "Sentences.xlsx"
file_sentences2 <- "Sentences_core.xlsx"
## Kanji list ####
# read.csv2() reads a semicolon-separated file; clean_names() then normalises
# the column headers so later steps can refer to the `kanji` column.
data_kanji <- read.csv2(here("data/kanji", file_kanji)) %>%
  clean_names()
## Jukugo list ####
# Compound words (jukugo). Rows are sorted by descending frequency before
# de-duplicating, so distinct() keeps the most frequent entry for each word.
data_jukugo <- read.csv2(here("data/kanji", file_jukugo)) %>%
  clean_names() %>%
  arrange(desc(frequency)) %>%
  select(comp_word, pronunciation, english_translation) %>%
  rename(
    word = comp_word,
    meaning = english_translation
  ) %>%
  distinct(word, .keep_all = TRUE) %>%
  # Change romaji: rewrite Kunrei-style spellings towards Hepburn.
  # NOTE(review): "zy" -> "jy" and "sy" -> "shy" produce "jya"/"shya" rather
  # than Hepburn "ja"/"sha"; kept as-is, confirm this is the intended spelling.
  mutate(
    pronunciation = gsub("zi", "ji", pronunciation),
    pronunciation = gsub("zy", "jy", pronunciation),
    pronunciation = gsub("ti", "chi", pronunciation),
    pronunciation = gsub("ty", "ch", pronunciation),
    pronunciation = gsub("si", "shi", pronunciation),
    pronunciation = gsub("sy", "shy", pronunciation),
    pronunciation = gsub("tu", "tsu", pronunciation),
    # "tyu" has already become "chu" above; without the lookbehind the "hu"
    # rule would corrupt it to "cfu". Only a bare "hu" (fu) is rewritten.
    pronunciation = gsub("(?<![cs])hu", "fu", pronunciation, perl = TRUE)
  )
## Word list ####
# Core vocabulary. Rows are ordered by core_index before de-duplicating, so
# distinct() keeps the entry with the lowest index for each word.
data_words <- read.csv(here("data/kanji", file_words)) %>%
  clean_names() %>%
  arrange(core_index) %>%
  select(vocab_expression, vocab_kana, vocab_meaning) %>%
  rename(
    word = vocab_expression,
    pronunciation = vocab_kana,
    meaning = vocab_meaning
  ) %>%
  distinct(word, .keep_all = TRUE)
## Number list ####
# Kanji numerals 0-100 with their kana readings and the number as text.
# The 101 rows are generated from digit and decade parts instead of being typed
# out, so the three columns cannot drift out of step. local() keeps the helper
# vectors out of the global environment.
data_numbers <- local({
  digit_word <- c("一", "二", "三", "四", "五", "六", "七", "八", "九")
  digit_kana <- c(
    "いち", "に", "さん", "し", "ご", "ろく", "しち", "はち", "きゅう"
  )

  # "十", "二十", ..., "九十"
  decade_word <- paste0(c("", digit_word[-1]), "十")
  # Decade readings are listed explicitly: 40 is read "よんじゅう" although the
  # digit 4 on its own is read "し".
  decade_kana <- c(
    "じゅう", "にじゅう", "さんじゅう", "よんじゅう", "ごじゅう",
    "ろくじゅう", "しちじゅう", "はちじゅう", "きゅうじゅう"
  )

  # Each decade followed by its nine decade + digit combinations (10-99).
  compose <- function(decades, digits) {
    paste0(rep(decades, each = 10), rep(c("", digits), times = 9))
  }

  data.frame(
    word = c("零", digit_word, compose(decade_word, digit_word), "百"),
    pronunciation = c(
      "れい", digit_kana, compose(decade_kana, digit_kana), "ひゃく"
    ),
    meaning = as.character(0:100)
  )
})
## Sentence list ####
# Two workbooks are read and stacked. rbind() requires both sheets to have the
# same column names.
data_sentences1 <- read_excel(here("data/kanji", file_sentences1))
data_sentences2 <- read_excel(here("data/kanji", file_sentences2))

data_sentences <- rbind(data_sentences1, data_sentences2)
# The per-file tables are not needed once combined.
rm(data_sentences1, data_sentences2)
# Extract kana ####
# Each string holds every code point of its range, concatenated.
hiragana_chars <- intToUtf8(seq(12353, 12438)) # hiragana, U+3041-U+3096
katakana_chars <- intToUtf8(seq(12448, 12543)) # katakana, U+30A0-U+30FF

# Regex alternation ("ぁ|あ|ぃ|...") that matches any single kana character;
# used below to strip kana from words so that only kanji remain.
kana <- paste(
  strsplit(paste0(hiragana_chars, katakana_chars), split = "")[[1]],
  collapse = "|"
)
# All data ####
# Stack every word source into one table with the columns word, pronunciation
# and meaning. Earlier sources win when distinct() drops duplicate words.
data_bind <- data_words %>%
  rbind(
    data_sentences %>% select(word, pronunciation, meaning),
    data_numbers,
    data_jukugo
  ) %>%
  # Strip spaces and ASCII letters from the word itself.
  mutate(word = str_remove_all(word, "[ a-zA-Z]")) %>%
  distinct(word, .keep_all = TRUE)
# Extract kanji ####
# Unique characters that remain after removing kana, collected from the kanji
# list and from every word in data_bind.
all_kanji <- data_kanji %>%
  select(kanji) %>%
  rbind(
    data_bind %>%
      mutate(kanji = word, .keep = "none")
  ) %>%
  mutate(kanji = str_remove_all(kanji, kana)) %>%
  filter(kanji != "") %>%
  pull(kanji) %>%
  # Join everything into one string, then split it back into single characters.
  paste(collapse = "") %>%
  str_split_1(pattern = "") %>%
  unique()
## Separate individual kanji from all words ####
# Remove kana from each word and split what is left into kanji_1..kanji_5.
# Rows with nothing in kanji_1 (no non-kana characters) are dropped.
# NOTE(review): `widths` gives the number of characters per output column, not
# a position, so kanji_2 receives up to 2 characters, kanji_3 up to 3, and so
# on. If one kanji per column was intended, every width should be 1 (and
# `too_many` would then need handling for words with more than 5 kanji).
# Values are left unchanged here; confirm against downstream use.
data_all <- data_bind %>%
  mutate(kanji = str_remove_all(word, kana)) %>%
  separate_wider_position(
    kanji,
    widths = c(
      "kanji_1" = 1,
      "kanji_2" = 2,
      "kanji_3" = 3,
      "kanji_4" = 4,
      "kanji_5" = 5
    ),
    too_few = "align_start"
  ) %>%
  filter(!is.na(kanji_1))

# Preview the first rows as a formatted table.
flextable(head(data_all))
| word | pronunciation | meaning | kanji_1 | kanji_2 | kanji_3 | kanji_4 | kanji_5 |
|---|---|---|---|---|---|---|---|
| 見る | みる | see, look at | 見 | | | | |
| 円 | えん | circle | 円 | | | | |
| 多い | おおい | lots of | 多 | | | | |
| 家 | うち | house, home | 家 | | | | |
| 新しい | あたらしい | new | 新 | | | | |
| 私 | わたし | I | 私 | | | | |