In my last post, I showed how to read a fixed width file into R. In this post, I'll refactor that code into a function that takes year/grade as a parameter.
First, read in the layout data frame. This contains the column headers for NJASK fixed with files.
load(file = 'datasets/njask_layout.rda')
#also read in the old NJASK layouts
load(file = 'datasets/njask05_layout.rda')
load(file = 'datasets/njask04_layout.rda')
load(file = 'datasets/njask07gr3_layout.rda')
load(file = 'datasets/njask06gr3_layout.rda')
load(file = 'datasets/njask06gr5_layout.rda')
head(layout_njask)
## field_start_position field_end_position field_length data_type
## 1 1 9 9 Text
## 2 1 2 2 Text
## 3 3 6 4 Text
## 4 7 9 3 Text
## 5 10 59 50 Text
## 6 60 109 50 Text
## description
## 1 CDS Code
## 2 County Code/DFG/Aggregation Code
## 3 District Code
## 4 School Code
## 5 County Name
## 6 District Name
## comments
## 1 For district and school records the first three fields combine to form the CDS code.
## 2 For district and school records this field contains a two-digit county code. For DFG records, this field contains the DFG code. For Statewide, Special Needs and Non-Special Needs records, it contains a two-letter abbreviation.
## 3 Applicable only for district and school records.
## 4 Applicable only for district and school records.
## 5 Applicable only for district and school records.
## 6 Applicable only for district and school records.
## valid_values
## 1 <NA>
## 2 County Code = 01, 03, 05, 07, 09, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 80\nDFG = A, B, CD, DE, FG, GH, I, J, R, V\nST = Statewide, NS = Non-Special Needs, SN = Special Needs
## 3 0100 to 9999, blank
## 4 001 to 999, blank
## 5 A to Z, blank
## 6 A to Z, blank
## spanner1 spanner2 final_name
## 1 CDS_Code
## 2 County_Code/DFG/Aggregation_Code
## 3 District_Code
## 4 School_Code
## 5 County_Name
## 6 District_Name
Our final call to read_fwf
in the last post looked like this:
sample_file = "http://www.state.nj.us/education/schools/achievement/14/njask8/state_summary.txt"
njask14_gr8 <- readr::read_fwf(
file = sample_file,
col_positions = readr::fwf_positions(
start = layout_njask$field_start_position,
end = layout_njask$field_end_position,
col_names = layout_njask$final_name
),
na = "*"
)
## Parsed with column specification:
## cols(
## .default = col_character(),
## TOTAL_POPULATION_Number_Enrolled_ELA = col_integer(),
## TOTAL_POPULATION_LANGUAGE_ARTS_Number_Not_Present = col_integer(),
## TOTAL_POPULATION_LANGUAGE_ARTS_Scale_Score_Mean = col_integer(),
## TOTAL_POPULATION_MATHEMATICS_Number_Enrolled_Math = col_integer(),
## TOTAL_POPULATION_MATHEMATICS_Number_Not_Present = col_integer(),
## TOTAL_POPULATION_MATHEMATICS_Scale_Score_Mean = col_integer(),
## TOTAL_POPULATION_SCIENCE_Number_Enrolled_Science = col_integer(),
## TOTAL_POPULATION_SCIENCE_Number_Not_Present = col_integer(),
## TOTAL_POPULATION_SCIENCE_Advanced_Proficient_Percentage = col_integer(),
## TOTAL_POPULATION_SCIENCE_Scale_Score_Mean = col_integer(),
## GENERAL_EDUCATION_Number_Enrolled_ELA = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Number_Not_Present = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Advanced_Proficient_Percentage = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Scale_Score_Mean = col_integer(),
## GENERAL_EDUCATION_MATHEMATICS_Number_Enrolled_Math = col_integer(),
## GENERAL_EDUCATION_MATHEMATICS_Number_Not_Present = col_integer(),
## GENERAL_EDUCATION_MATHEMATICS_Advanced_Proficient_Percentage = col_integer(),
## GENERAL_EDUCATION_MATHEMATICS_Scale_Score_Mean = col_integer(),
## GENERAL_EDUCATION_SCIENCE_Number_Enrolled_Science = col_integer(),
## GENERAL_EDUCATION_SCIENCE_Number_Not_Present = col_integer()
## # ... with 240 more columns
## )
## See spec(...) for full column specifications.
## Warning: 2578 parsing failures.
## row col expected actual
## 1 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 0
## 1 NA 551 columns 549 columns
## 2 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 0
## 2 NA 551 columns 549 columns
## 3 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 2
## ... .............................................................. ........... ...........
## See problems(...) for more details.
A script that has a bunch of copy/paste versions of that call would probably get the job done, but we're writing for our future selves here, and those url paths are easy to build. The function should:
1) build the target url
2) use hadley's readr
to fetch the fixed with file, using the provided field definitions.
get_raw_njask <- function(year, grade, layout=layout_njask) {
require(readr)
#url paths changed after the 2012 assessment
years <- list(
"2014"="14", "2013"="13", "2012"="2013", "2011"="2012", "2010"="2011", "2009"="2010",
"2008"="2009", "2007"="2008", "2006"="2007", "2005"="2006", "2004"="2005"
)
parsed_year <- years[[as.character(year)]]
#2008 follows a totally unique pattern
grade_str <- if (year==2008 & grade >=5) {
paste0('58/g', grade)
} else if (year %in% c(2006, 2007) & grade %in% c(5, 6, 7)) {
'57'
} else {
grade
}
#filenames are also inconsistent
filename <- list(
"2014"="state_summary.txt", "2013"="state_summary.txt", "2012"="state_summary.txt",
"2011"="state_summary.txt", "2010"="state_summary.txt", "2009"="state_summary.txt",
"2008"="state_summary.txt", "2007"=if(grade >= 5) {
paste0('G', grade, 'state_summary.txt')
} else {
"state_summary.txt"
},
"2006"=if(grade >= 5) {
paste0('G', grade, 'state_summary.txt')
} else {
"state_summary.txt"
},
"2005"= if(grade==3) {
"njask005_state_summary3.txt"
} else if (grade==4) {
"njask2005_state_summary4.txt"
},
"2004"=paste0("njask", grade, "04state_summary.txt")
)
parsed_filename <- filename[[as.character(year)]]
#build url
target_url <- paste0(
"http://www.state.nj.us/education/schools/achievement/", parsed_year,
"/njask", grade_str, "/", parsed_filename
)
#read_fwf
df <- readr::read_fwf(
file = target_url,
col_positions = readr::fwf_positions(
start = layout$field_start_position,
end = layout$field_end_position,
col_names = layout$final_name
),
na = "*"
)
#return df
return(df)
}
Let's give it a try!
library(magrittr)
ex <- get_raw_njask(2014, 6)
## Parsed with column specification:
## cols(
## .default = col_character(),
## TOTAL_POPULATION_Number_Enrolled_ELA = col_integer(),
## TOTAL_POPULATION_LANGUAGE_ARTS_Scale_Score_Mean = col_integer(),
## TOTAL_POPULATION_MATHEMATICS_Number_Enrolled_Math = col_integer(),
## TOTAL_POPULATION_SCIENCE_Advanced_Proficient_Percentage = col_integer(),
## TOTAL_POPULATION_SCIENCE_Scale_Score_Mean = col_integer(),
## GENERAL_EDUCATION_Number_Enrolled_ELA = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Number_Not_Present = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Advanced_Proficient_Percentage = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Scale_Score_Mean = col_integer(),
## GENERAL_EDUCATION_MATHEMATICS_Number_Enrolled_Math = col_integer(),
## GENERAL_EDUCATION_MATHEMATICS_Number_Not_Present = col_integer(),
## GENERAL_EDUCATION_SCIENCE_Advanced_Proficient_Percentage = col_integer(),
## GENERAL_EDUCATION_SCIENCE_Scale_Score_Mean = col_integer(),
## SPECIAL_EDUCATION_Number_Enrolled_ELA = col_integer(),
## SPECIAL_EDUCATION_LANGUAGE_ARTS_Advanced_Proficient_Percentage = col_integer(),
## SPECIAL_EDUCATION_LANGUAGE_ARTS_Scale_Score_Mean = col_integer(),
## SPECIAL_EDUCATION_MATHEMATICS_Number_Enrolled_Math = col_integer(),
## SPECIAL_EDUCATION_SCIENCE_Advanced_Proficient_Percentage = col_integer(),
## SPECIAL_EDUCATION_SCIENCE_Scale_Score_Mean = col_integer(),
## LIMITED_ENGLISH_PROFICIENT_current_and_former_Number_Enrolled_ELA = col_integer()
## # ... with 147 more columns
## )
## See spec(...) for full column specifications.
## Warning: 3010 parsing failures.
## row col expected actual
## 1 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 0
## 1 NA 551 columns 549 columns
## 2 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 0
## 2 NA 551 columns 549 columns
## 3 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 2
## ... .............................................................. ........... ...........
## See problems(...) for more details.
dplyr::sample_n(ex[, sample(c(1:551), 10)], 10) %>% as.data.frame()
## SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Advanced_Proficient_Percentage
## 1 0146
## 2 0146
## 3 0146
## 4 0146
## 5 0146
## 6 0146
## 7 0146
## 8 0146
## 9 0146
## 10 0146
## MIGRANT_SCIENCE_Number_Not_Present
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## BLACK_SCIENCE_Number_Enrolled_Science
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## CURRENT_LIMITED_ENGLISH_PROFICIENT_SCIENCE_Number_APA
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## MIGRANT_MATHEMATICS_Proficient_Percentage
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## FORMER_LIMITED_ENGLISH_PROFICIENT_MATHEMATICS_Number_of_Voids
## 1
## 2
## 3
## 4 <NA>
## 5
## 6
## 7 <NA>
## 8 <NA>
## 9 <NA>
## 10
## SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Partially_Proficient_Percentage
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## CURRENT_LIMITED_ENGLISH_PROFICIENT_LANGUAGE_ARTS_Number_of_Valid_Scale_Scores
## 1 * *
## 2 * *
## 3 * *
## 4 * *
## 5
## 6
## 7
## 8 * *
## 9 * *
## 10
## NON-MIGRANT_MATHEMATICS_Number_APA
## 1 <NA>
## 2 13 15
## 3 31 32
## 4 141 7
## 5 31 9
## 6 <NA>
## 7 <NA>
## 8 58 32
## 9 264 7
## 10 155 9
## SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_MATHEMATICS_Partially_Proficient_Percentage
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
Great - we can definitely pull down all the data files we need. Now let's make sure that we are doing the necessary cleanup. In particular, there are 228 columns in our layout file that have the comment 'One implied decimal.' That really matters - it means that 255
actually means 25.5.
For each of those columns, we want to apply a function to each column. Hadley's dplyr
has a nice utility for doing this - mutate_each
.
We'll also need to make sure that we only mutate the columns that are tagged with 'One implied decimal.'
library(dplyr)
process_njask <- function(df, mask=layout_njask$comments) {
#keep the names to put back in the same order
all_names <- names(df)
#replace any line breaks in last column
df$Grade <- gsub('\n', '', df$Grade, fixed = TRUE)
mask_boolean <- mask == 'One implied decimal'
#put some columns aside
ignore <- df[, !mask_boolean]
#process the columns that have an implied decimal
processed <- df[, mask_boolean] %>%
dplyr::mutate_each(
dplyr::funs(implied_decimal = . / 10)
)
#put back together
final <- cbind(ignore, processed)
#reorder and return
final %>%
select(
one_of(names(df))
)
}
ex_process <- process_njask(ex)
## Error: NA column indexes not supported
head(ex_process[, 1:15])
## Error in head(ex_process[, 1:15]): object 'ex_process' not found
Now we put it all together, with a little wrapper function around the get_raw_njask
and process_njask
functions:
fetch_njask <- function(year, grade) {
if (year == 2004) {
df <- get_raw_njask(year, grade, layout=layout_njask04) %>%
process_njask(mask=layout_njask04$comments)
}
else if (year == 2005) {
df <- get_raw_njask(year, grade, layout=layout_njask05) %>%
process_njask(mask=layout_njask05$comments)
} else if (year %in% c(2007, 2008) & grade %in% c(3, 4)) {
df <- get_raw_njask(year, grade, layout=layout_njask07gr3) %>%
process_njask(mask=layout_njask07gr3$comments)
}
else if (year == 2006 & grade %in% c(3, 4)) {
df <- get_raw_njask(year, grade, layout=layout_njask06gr3) %>%
process_njask(mask=layout_njask06gr3$comments)
} else if (year == 2006 & grade >= 5) {
#fetch
df <- get_raw_njask(year, grade, layout=layout_njask06gr5)
#inexplicably 2006 data has no Grade column
df$Grade <- grade
df <- df %>% process_njask(mask=layout_njask06gr5$comments)
}
else {
df <- get_raw_njask(year, grade) %>% process_njask()
}
return(df)
}
fetch_njask(2014, 6) %>% head() %>% select(CDS_Code:TOTAL_POPULATION_LANGUAGE_ARTS_Scale_Score_Mean)
## Parsed with column specification:
## cols(
## .default = col_character(),
## TOTAL_POPULATION_Number_Enrolled_ELA = col_integer(),
## TOTAL_POPULATION_LANGUAGE_ARTS_Scale_Score_Mean = col_integer(),
## TOTAL_POPULATION_MATHEMATICS_Number_Enrolled_Math = col_integer(),
## TOTAL_POPULATION_SCIENCE_Advanced_Proficient_Percentage = col_integer(),
## TOTAL_POPULATION_SCIENCE_Scale_Score_Mean = col_integer(),
## GENERAL_EDUCATION_Number_Enrolled_ELA = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Number_Not_Present = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Advanced_Proficient_Percentage = col_integer(),
## GENERAL_EDUCATION_LANGUAGE_ARTS_Scale_Score_Mean = col_integer(),
## GENERAL_EDUCATION_MATHEMATICS_Number_Enrolled_Math = col_integer(),
## GENERAL_EDUCATION_MATHEMATICS_Number_Not_Present = col_integer(),
## GENERAL_EDUCATION_SCIENCE_Advanced_Proficient_Percentage = col_integer(),
## GENERAL_EDUCATION_SCIENCE_Scale_Score_Mean = col_integer(),
## SPECIAL_EDUCATION_Number_Enrolled_ELA = col_integer(),
## SPECIAL_EDUCATION_LANGUAGE_ARTS_Advanced_Proficient_Percentage = col_integer(),
## SPECIAL_EDUCATION_LANGUAGE_ARTS_Scale_Score_Mean = col_integer(),
## SPECIAL_EDUCATION_MATHEMATICS_Number_Enrolled_Math = col_integer(),
## SPECIAL_EDUCATION_SCIENCE_Advanced_Proficient_Percentage = col_integer(),
## SPECIAL_EDUCATION_SCIENCE_Scale_Score_Mean = col_integer(),
## LIMITED_ENGLISH_PROFICIENT_current_and_former_Number_Enrolled_ELA = col_integer()
## # ... with 147 more columns
## )
## See spec(...) for full column specifications.
## Warning: 3010 parsing failures.
## row col expected actual
## 1 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 0
## 1 NA 551 columns 549 columns
## 2 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 0
## 2 NA 551 columns 549 columns
## 3 SPECIAL_EDUCATION_WITH_ACCOMMODATIONS_SCIENCE_Scale_Score_Mean 4 chars 2
## ... .............................................................. ........... ...........
## See problems(...) for more details.
## Error: NA column indexes not supported
There's a more data out there than just the grade 3 NJASK, though. In mynext post, we'll tackle the HSPA, and dive into some older data - the 'Grade Eight Proficiency Assessment' (GEPA) that used to be offered to students at the end of middle school, with the ultimate goal of stitching everything together into a simplified R interface for NJ assessment data.