Data tweaks (#94)

jonthegeek · web-flow · commit eb5d52ad0b56 · 2025-06-04T12:27:52.000Z
* Fix typo in gutenberg_languages parser (`lanuage` -> `language` in the `mutate()` call)

* Add tests checking the class, column names, and minimum row count of each of the four datasets.
diff --git a/NEWS.md b/NEWS.md
@@ -35,7 +35,7 @@
 
 * Make compatible with tidyr v1.0.0
 * data_frame is deprecated, use tibble (thanks @evanodell for #21)
-* ROpenSci updates to README (thanks @maelle for #23)
+* rOpenSci updates to README (thanks @maelle for #23)
 
 # gutenbergr 0.1.4
 
@@ -60,7 +60,7 @@
 * Added `all_languages` and `only_languages` arguments to `gutenberg_works`, allowing fine-grained control of languages. (For example, "either English or French" or "both English and French")
 * Changed get_gutenberg_mirror to use xml2 directly, in order to handle AppVeyor
 * Removed use of data() in `gutenberg_works`, since it slows down `gutenberg_works` about 2X
-* Various documentation, vignette, and README adjustments in response to ROpenSci feedback.
+* Various documentation, vignette, and README adjustments in response to rOpenSci feedback.
 * Added AppVeyor for Windows continuous integration
 * Added code coverage information through codecov.io and covr, along with tests to improve coverage
 
diff --git a/R/data.R b/R/data.R
@@ -13,7 +13,7 @@
 #'   \item{title}{Title}
 #'   \item{author}{Author, if a single one given. Given as last name
 #'   first (e.g. "Doyle, Arthur Conan")}
-#'   \item{author_id}{Project Gutenberg author ID}
+#'   \item{gutenberg_author_id}{Project Gutenberg author ID}
 #'   \item{language}{Language ISO 639 code, separated by / if multiple. Two
 #'   letter code if one exists, otherwise three letter. See
 #'   \url{https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes}}
diff --git a/data-raw/parse_rdfs.R b/data-raw/parse_rdfs.R
@@ -32,7 +32,7 @@ new_gutenberg_languages <- purrr::map(all_metadata, ~ .x$languages) |>
   purrr::list_rbind() |>
   dplyr::distinct() |>
   dplyr::arrange(gutenberg_id, language) |>
-  dplyr::mutate(lanuage = as.factor(language))
+  dplyr::mutate(language = as.factor(language))
 
 new_gutenberg_metadata <- purrr::map(all_metadata, ~ .x$metadata) |>
   purrr::list_rbind() |>
diff --git a/data/gutenberg_languages.rda b/data/gutenberg_languages.rda
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -0,0 +1,35 @@
+AppVeyor
+CMD
+Codecov
+ORCID
+README
+WikipediR
+Wuthering
+audiobooks
+codecov
+covr
+df
+dplyr
+gutenberg
+http
+https
+humaniformat
+io
+lcc
+lcsh
+magrittr
+pageview
+pkgdown
+pre
+rOpenSci
+ropensci
+ropenscilabs
+roxygen
+rvest
+tbl
+tibble
+tidyr
+tidytext
+tm
+tokenization
+wikipediatrend
diff --git a/man/gutenberg_metadata.Rd b/man/gutenberg_metadata.Rd
diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R
@@ -9,3 +9,56 @@ test_that("All four datasets have a date-updated", {
   expect_s3_class(d3, "Date")
   expect_s3_class(d4, "Date")
 })
+
+test_that("gutenberg_metadata has the expected shape", {
+  expect_s3_class(gutenberg_metadata, c("tbl_df", "tbl", "data.frame"))
+  expect_named(
+    gutenberg_metadata,
+    c(
+      "gutenberg_id",
+      "title",
+      "author",
+      "gutenberg_author_id",
+      "language",
+      "gutenberg_bookshelf",
+      "rights",
+      "has_text"
+    )
+  )
+  expect_gte(nrow(gutenberg_metadata), 79491)
+})
+
+test_that("gutenberg_subjects has the expected shape", {
+  expect_s3_class(gutenberg_subjects, c("tbl_df", "tbl", "data.frame"))
+  expect_named(
+    gutenberg_subjects,
+    c("gutenberg_id", "subject_type", "subject")
+  )
+  expect_gte(nrow(gutenberg_subjects), 255000)
+})
+
+test_that("gutenberg_authors has the expected shape", {
+  expect_s3_class(gutenberg_authors, c("tbl_df", "tbl", "data.frame"))
+  expect_named(
+    gutenberg_authors,
+    c(
+      "gutenberg_author_id",
+      "author",
+      "alias",
+      "birthdate",
+      "deathdate",
+      "wikipedia",
+      "aliases"
+    )
+  )
+  expect_gte(nrow(gutenberg_authors), 26000)
+})
+
+test_that("gutenberg_languages has the expected shape", {
+  expect_s3_class(gutenberg_languages, c("tbl_df", "tbl", "data.frame"))
+  expect_named(
+    gutenberg_languages,
+    c("gutenberg_id", "language", "total_languages")
+  )
+  expect_gte(nrow(gutenberg_languages), 76000)
+})