Downloading Shakespeare Text

The following code downloads a zipped folder containing the full text of 37 of Shakespeare’s plays (originally available at lexically.net) and extracts it into a temporary directory.

# Download the zipped corpus into the session's temporary directory.
td <- tempdir()
tf <- tempfile(tmpdir = td, fileext = ".zip")
# mode = "wb" requests a binary transfer explicitly, so the archive cannot be
# damaged by line-ending translation on Windows.
download.file("https://github.com/clarkdatalabs/soms/raw/master/ShakespearePlaysPlus.zip", 
    tf, mode = "wb")

# The first path component of the first archive entry is taken as the
# archive's top-level folder. This works whether that entry is the folder
# itself ("name/") or a file nested inside it, and it keeps a trailing slash
# out of fpath.
first_entry <- unzip(tf, list = TRUE)$Name[1]
root_dir <- strsplit(first_entry, "/", fixed = TRUE)[[1]][1]
unzip(tf, exdir = td, overwrite = TRUE)
fpath <- file.path(td, root_dir)
remove(first_entry, root_dir)
# The archive itself is no longer needed once it has been extracted.
unlink(tf)
# Every immediate sub-directory of the extracted folder is treated as a genre.
genres <- list.dirs(fpath, full.names = FALSE, recursive = FALSE)

Make play_table

library("readr")
# Build play_table: one row per play file found directly inside a genre
# folder, holding the play name (file name without ".txt"), the genre and the
# file's text. The result is also written to "play_table.csv".
#
# The work runs inside local() so the loop temporaries never reach the
# workspace and no remove() call is needed afterwards.
play_table <- local({
    rows <- list()
    for (genre in genres) {
        genre_dir <- file.path(fpath, genre)
        # Escaped and anchored: only names that END in a literal ".txt" match.
        play_files <- list.files(genre_dir, pattern = "\\.txt$")
        for (play_file in play_files) {
            rows[[length(rows) + 1L]] <- data.frame(
                # Strip only the trailing extension, not any "<char>txt" run.
                play = sub("\\.txt$", "", play_file),
                genre = genre,
                text = read_file(file.path(genre_dir, play_file), locale(encoding = "UTF-16")),
                stringsAsFactors = FALSE
            )
        }
    }
    if (length(rows) > 0L) {
        # Bind once at the end instead of growing the data frame per file.
        do.call(rbind, rows)
    } else {
        # No files found: keep the original empty three-column shape.
        data.frame(play = character(), genre = character(), text = character(),
            stringsAsFactors = FALSE)
    }
})
write.csv(play_table, file = "play_table.csv", row.names = FALSE)

Make character_table

library("readr")
library("stringi")

# Build character_table: one row per character file. Each sub-directory of a
# genre folder is treated as one play (its name with "_character" removed),
# and each ".txt" file inside it as one character, whose name is the file name
# without the extension, converted to title case. The result is also written
# to "character_table.csv".
#
# The work runs inside local() so the loop temporaries never reach the
# workspace and no remove() call is needed afterwards.
character_table <- local({
    rows <- list()
    for (genre in genres) {
        genre_dir <- file.path(fpath, genre)
        play_dirs <- list.dirs(genre_dir, full.names = FALSE, recursive = FALSE)
        for (play_dir in play_dirs) {
            play_name <- gsub("_character", "", play_dir)
            char_dir <- file.path(genre_dir, play_dir)
            # Escaped and anchored: only names that END in a literal ".txt" match.
            char_files <- list.files(char_dir, pattern = "\\.txt$")
            for (char_file in char_files) {
                rows[[length(rows) + 1L]] <- data.frame(
                    # Strip only the trailing extension before title-casing.
                    char = stri_trans_totitle(sub("\\.txt$", "", char_file)),
                    play = play_name,
                    genre = genre,
                    text = read_file(file.path(char_dir, char_file), locale(encoding = "UTF-16")),
                    stringsAsFactors = FALSE
                )
            }
        }
    }
    if (length(rows) > 0L) {
        # Bind once at the end instead of growing the data frame per file.
        do.call(rbind, rows)
    } else {
        # No files found: keep the original empty four-column shape.
        data.frame(char = character(), play = character(), genre = character(),
            text = character(), stringsAsFactors = FALSE)
    }
})
write.csv(character_table, file = "character_table.csv", row.names = FALSE)

Clean up Workspace

# Delete the extracted corpus folder. unlink() removes a directory only when
# recursive = TRUE, so the previous recursive = FALSE call on td deleted
# nothing. The session temporary directory (td) is deliberately left in place
# because R keeps using it for the rest of the session.
# Any trailing "/" on fpath is stripped first, since a directory entry name
# taken from a zip listing may carry one.
unlink(sub("/+$", "", fpath), recursive = TRUE)
remove(fpath, genres, td, tf)