library(tidyverse)
library(knitr)
library(rvest)
library(broom)
library(dplyr)
library(purrr)
library(ggplot2)
library(plotly)
library(readxl)
library(reshape2)
library(akima)
library(wordcloud)
library(tm)

Import and tidy up data

# NYC zip code
url = "https://p8105.com/data/zip_codes.html"
ny_zip_codes = read_html(url) |>
  html_table() |> 
  data.frame() |> 
  janitor::clean_names() |> 
  mutate(
    borough = factor(
      county,
      levels=c("Bronx","Kings","New York", "Queens","Richmond"),
      labels=c("Bronx","Brooklyn","Manhattan","Queens",
 "Staten Island"))) 

valid_zipcodes = pull(ny_zip_codes, zip_code)

# Dog bite 
dog_bites_df = read_csv("data/Dog_Bites_Data.csv", na = c("NA", "", ".")) |> 
  janitor::clean_names() |> 
  rename_with(~ gsub("^x", "", .))


dog_bites_clean = dog_bites_df |> 
  mutate(date_of_bite = as.Date(date_of_bite, format = "%B %d %Y")) |> 
  mutate(year = format(date_of_bite, "%Y"),
         month = format(date_of_bite, "%m"),
         day = format(date_of_bite, "%d")) |> 
  mutate(year = as.factor(as.numeric(year)),
         month = factor(as.numeric(month), levels = 1:12),
         day = as.factor(as.numeric(day))
         ) |> 
  mutate(breed = str_to_lower(breed)) |> 
  mutate(breed = ifelse(grepl("mix|mxied|cross|\\s[xX]|-X|/|&|COLLILE,|yorkie poo|yorkie chon|yorkie-poo|yorkipoo|boxer beagle|shi-po|Doberman And Labrador", breed, ignore.case = TRUE), "Mixed", breed)) |> 
  mutate(breed = ifelse(grepl("bull", breed, ignore.case = TRUE), "Bull", breed)) |> 
  mutate(breed = ifelse(grepl("know|KOW|unsure|not given|certain|not sure|unc", breed, ignore.case = TRUE), "Unknown", breed)) |> 
  mutate(breed = ifelse(grepl("poo", breed, ignore.case = TRUE), "Poodle", breed)) |> 
  mutate(breed = ifelse(grepl("vizsla", breed, ignore.case = TRUE), "Vizsla", breed)) |> 
  mutate(breed = ifelse(grepl("shepher|sheherd|shepard|sheep|shpherd|sheperd", breed, ignore.case = TRUE), "Shepherd", breed)) |> 
  mutate(breed = ifelse(grepl("husky", breed, ignore.case = TRUE), "Husky", breed)) |> 
  mutate(breed = ifelse(grepl("chihuahua|chi hua", breed, ignore.case = TRUE), "Chihuahua", breed)) |>
  mutate(breed = ifelse(grepl("collie", breed, ignore.case = TRUE), "Collie", breed)) |> 
  mutate(breed = ifelse(grepl("cattle", breed, ignore.case = TRUE), "Cattle dog", breed)) |> 
  mutate(breed = ifelse(grepl("yorkie|yorkshire", breed, ignore.case = TRUE), "Yorkshire", breed)) |> 
  mutate(breed = ifelse(grepl("schnauzer", breed, ignore.case = TRUE), "Schnauzer", breed)) |> 
  mutate(breed = ifelse(grepl("coonhound", breed, ignore.case = TRUE), "Coonhound", breed)) |> 
  mutate(breed = ifelse(grepl("corgi", breed, ignore.case = TRUE), "Corgie", breed)) |> 
  mutate(breed = ifelse(grepl("dachshund", breed, ignore.case = TRUE), "Dachshund", breed)) |> 
  mutate(breed = ifelse(grepl("beagle", breed, ignore.case = TRUE), "Beagle", breed)) |> 
  mutate(breed = ifelse(grepl("west", breed, ignore.case = TRUE), "Westie", breed)) |> 
  mutate(breed = ifelse(grepl("mastiff", breed, ignore.case = TRUE), "Mastiff", breed)) |> 
  mutate(breed = ifelse(grepl("malti tzu|maltese", breed, ignore.case = TRUE), "Maltese", breed)) |> 
  mutate(breed = ifelse(grepl("shih tzu|Shichon|Shichi", breed, ignore.case = TRUE), "Shih tzu", breed)) |> 
  mutate(breed = ifelse(grepl("parson", breed, ignore.case = TRUE), "Parson", breed)) |> 
  mutate(breed = ifelse(grepl("nova", breed, ignore.case = TRUE), "Nova Scotia Duck Tolling Retriever", breed)) |> 
  mutate(breed = ifelse(grepl("Staffordshire", breed, ignore.case = TRUE), "Staffordshire", breed)) |> 
  mutate(breed = ifelse(grepl("skan mal", breed, ignore.case = TRUE), "alaskan malamute", breed)) |> 
  mutate(breed = ifelse(grepl("russell terr", breed, ignore.case = TRUE), "russell terrier", breed)) |> 
  mutate(breed = ifelse(grepl("shiba", breed, ignore.case = TRUE), "Shiba", breed)) |> 
  mutate(breed = ifelse(grepl("American Terrier", breed, ignore.case = TRUE), "American Terrier", breed)) |> 
  mutate(breed = ifelse(grepl("Golden", breed, ignore.case = TRUE), "Golden Doodle", breed)) |>
  mutate(breed = ifelse(grepl("Springer", breed, ignore.case = TRUE), "Springer", breed)) |> 
  mutate(breed = ifelse(grepl("Catahoula", breed, ignore.case = TRUE), "Catahoula", breed)) |> 
  mutate(breed = ifelse(grepl("Crested", breed, ignore.case = TRUE), "Crested", breed)) |> 
  mutate(breed = ifelse(grepl("Spaniel", breed, ignore.case = TRUE), "Spaniel", breed)) |> 
  mutate(breed = ifelse(grepl("Dandie Dinmont", breed, ignore.case = TRUE), "Dandie Dinmont", breed)) |> 
  mutate(breed = ifelse(grepl("Pug", breed, ignore.case = TRUE), "Puggle", breed)) |> 
  mutate(breed = ifelse(grepl("Potcake", breed, ignore.case = TRUE), "Potcake", breed)) |> 
  mutate(breed = ifelse(grepl("Border Terrier", breed, ignore.case = TRUE), "Border Terrier", breed)) |> 
  mutate(breed = ifelse(grepl("Blue He", breed, ignore.case = TRUE), "Blue Heeler", breed)) |> 
  mutate(breed = ifelse(grepl("Bernedoodle", breed, ignore.case = TRUE), "Bernadoodle", breed)) |> 
  mutate(breed = ifelse(grepl("ne Corso", breed, ignore.case = TRUE), "Caine Corso", breed)) |> 
  mutate(breed = ifelse(grepl("ton De Tulear", breed, ignore.case = TRUE), "Cotton De Tulear", breed)) |> 
  mutate(breed = ifelse(grepl("Daschound|Daschund|Dachshund", breed, ignore.case = TRUE), "Daschund", breed)) |> 
  mutate(breed = ifelse(grepl("Pomsk", breed, ignore.case = TRUE), "Pomski", breed)) |> 
  mutate(breed = ifelse(grepl("Miniature Labradoodle", breed, ignore.case = TRUE), "Mini Labordoodle", breed)) |> 
  mutate(breed = ifelse(grepl("Miniature Pinscher", breed, ignore.case = TRUE), "Mini Pincher", breed)) |> 
  mutate(breed = ifelse(grepl("Wheaten Terrier|wheaton", breed, ignore.case = TRUE), "Wheaton Terrier", breed)) |>
  mutate(breed = ifelse(grepl("Retriever", breed, ignore.case = TRUE), "Retreiver", breed)) |>
  mutate(breed = ifelse(grepl("Dogue De Bord", breed, ignore.case = TRUE), "Dogue De Bordeaux", breed)) |>
  mutate(breed = ifelse(grepl("Medium", breed, ignore.case = TRUE), "Medium", breed)) |>
  mutate(breed = ifelse(grepl("small", breed, ignore.case = TRUE), "small", breed)) |>
  mutate(breed = ifelse(grepl("Wolfhound", breed, ignore.case = TRUE), "Wolfhound", breed)) |>
  mutate(breed = ifelse(grepl("Terrier", breed, ignore.case = TRUE), "Terrier", breed)) |>
  mutate(breed = ifelse(grepl("small", breed, ignore.case = TRUE), "small", breed)) |>
  mutate(breed = str_to_title(breed))

# Dog license
dog_licensing_df = read_csv("data/NYC_Dog_Licensing_Dataset.csv", na = c("NA", "", ".")) |> 
  janitor::clean_names() |> 
  rename_with(~ gsub("^x", "", .))

This project used three datasets: NYC zip code data, NYC dog bite incidents data, and NYC dog licensing data. The NYC zip code dataset was obtained from the P8105 course website; the NYC dog bite incidents dataset was obtained from NYC Open Data; the NYC dog licensing dataset was obtained from the DOHMH Dog Licensing System.

In our exploratory data analysis below, we will examine how dog bite incidents are distributed by different variables in our data, including breed, gender, neuter status, borough, time, and license status.

Dog bites by breeds

top_breeds = dog_bites_clean |> 
  group_by(breed) |> 
  summarise(count = n()) |> 
  arrange(desc(count)) |> 
  filter(!is.na(breed) & breed != "unknown") |> 
  slice_head(n = 10)

top_breed_inter = plot_ly(
  data = top_breeds,
  labels = ~breed,
  values = ~count,
  type = 'pie',
  textinfo = 'label+percent',
  hoverinfo = 'label+value+percent',
  marker = list(line = list(color = '#FFFFFF', width = 2))
) |>
  layout(
    title = "Top 10 Breeds Involved in Bite Count",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )

top_breed_inter

The top 10 dog breeds that contributed to the most dog bite incidents are shown in the pie chart above. Besides the mixed and the unknown categories, the breed that contributed to the most dog bite incidents are bulls (29.2%), followed by shepards (5.06%) and shih tzus (4.51%).

Despite delibrate data cleaning, there are still many dogs in our dataset whose breed information were either unknown or too complicated to be grouped into a single breed. The mixed and unknown categories consist of almost half of the pie, which indicates that there might be errors in the data collection process in the first place that could likely affect the make up of this pie chart and our conclusion.

bite_breed_counts <- dog_bites_clean |>
  mutate(breed = tolower(breed)) |>  
  filter(!is.na(breed), breed != "unknown", breed != "mixed") |>  
  count(breed, name = "count") |>  
  arrange(desc(count))  

set.seed(1234) 

wordcloud(
  words = pull(bite_breed_counts, breed), 
  freq = pull(bite_breed_counts, count), 
  min.freq = 1,
  max.words = 200,  
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

The above word cloud consists of all breeds recorded in the dog bite incident dataset. The most frequent breed names are “bulls,” “shepards,” “shih tzus,” and “chihuahuas,” which is consistent with results from the pie chart.

Dog bites by gender and neuter status

gender_spay_plot = ggplot(dog_bites_clean, aes(x = gender, fill = spay_neuter)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(title = "Dog Bites by Gender and Spay/Neuter Status",
       x = "Gender",
       y = "Count of Dog Bites",
       fill = "Spay/Neuter Status") +
  scale_fill_brewer(palette = "Set1") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

# Display the static plot
gender_spay_plot_inter = ggplotly(gender_spay_plot)
gender_spay_plot_inter

Across dog bite data with known genders, dog bite incidents appeared more than twice as frequent for male dogs compared to female dogs, for both neutered and un-neutered dogs. Comparing their neutered status, there is only a slight difference in dog bite incidence between the neutered and un-neutered groups. Neutered female dogs contributed to slightly more dog bite incidents than un-neitered female dogs, while un-neutered male dogs contributed to slightly more dog bite incidents.

It is worth noting that a significant portion of dog bite data had missing data for the dogs’ gender, and that among the unknown gender group, most dogs appeared to be un-neutered. The amount of missing information in these two variables could be leading us to an incomplete analysis of how dog bites were associated with gender and neuter status.

NYC Dog Licensing Counts

breed_counts <- dog_licensing_df |>
  mutate(breed_name = tolower(breed_name)) |> 
  filter(!is.na(breed_name), breed_name != "unknown") |>
  count(breed_name, name = "count") |>  
  arrange(desc(count)) 

set.seed(1234)  
wordcloud(words = breed_counts$breed_name, 
          freq = breed_counts$count, 
          min.freq = 1, 
          max.words = 200, 
          random.order = FALSE, 
          rot.per = 0.35, 
          colors = brewer.pal(8, "Dark2"))

Dog licensing data can help us infer the dog populations living in each borough, since it is required for all dogs living in New York City to be licensed. The word cloud above illustrates all the breed names recorded in the dog licensing dataset. The most notable and frequent breed names are: shih tzu, chihuahua, labrador retriever, maltese, etc.

Dog bites by borough

ny_zip_codes_filtered = ny_zip_codes |> 
  dplyr::select(zip_code, borough)

license_borough = dog_licensing_df |> 
  filter(zip_code %in% valid_zipcodes) |> 
  merge( ny_zip_codes_filtered, by = "zip_code", all.x = TRUE)

bites_by_borough = dog_bites_clean |>
  group_by(borough) |>
  summarise(Bite_Count = n())

licenses_by_borough = license_borough |>
  group_by(borough) |>
  summarise(License_Count = n())


borough_data = merge(bites_by_borough, licenses_by_borough, by = "borough", all = TRUE) |> 
  drop_na()

borough_data_long = borough_data |>
  pivot_longer(cols = c("License_Count", "Bite_Count"),
               names_to = "Count_Type",
               values_to = "Count")

borough_grouped_bar_plot = ggplot(borough_data_long, aes(x = borough, y = Count, fill = Count_Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.6) +
  scale_fill_manual(name = "Count Type", values = c("License_Count" = "skyblue", "Bite_Count" = "orange"),
                    labels = c("License_Count" = "Licenses", "Bite_Count" = "Bites")) +
  labs(title = "Dog Licenses and Bites by Borough", x = "Borough", y = "Count") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

borough_interactive = ggplotly(borough_grouped_bar_plot)
borough_interactive

Looking at license counts across boroughs, Manhattan has the most licensed dogs, followed by Brooklyn, Queens, Bronx, and Staten Island. Looking at dog bite incidents across these boroughs, however, Queens reported the most incidents, followed by Manhattan, Brooklyn, Bronx, and lastly Staten Island. The mismatch between these two sets of bar likely implies that Queens may be a hotspot for dog bite incidents. We will take a closer look at this by analyzing the bite-to-license ratio in the pie charts below.

bite_borough_data = borough_data |>
  mutate(Bite_Percentage = Bite_Count / sum(Bite_Count) * 100,
         Bite_Label = paste0(borough, " (", round(Bite_Percentage, 1), "%)"))

bite_pie_interactive = plot_ly(
  data = bite_borough_data,
  labels = ~borough,
  values = ~Bite_Count,
  type = 'pie',
  textinfo = 'label+percent',
  hoverinfo = 'label+value+percent',
  textfont = list(size = 8),
  title = "Dog Bites by Borough",
  domain = list(x = c(0, 0.3)), 
  marker = list(colors = RColorBrewer::brewer.pal(n = length(bite_borough_data$borough), name = "Set3"))
) 

license_pie_data = borough_data |>
  mutate(License_Percentage = License_Count / sum(License_Count) * 100,
         License_Label = paste0(borough, " (", round(License_Percentage, 1), "%)"))

license_pie_interactive = plot_ly(
  data = license_pie_data,
  labels = ~borough,
  values = ~License_Count,
  type = 'pie',
  textinfo = 'label+percent',
  hoverinfo = 'label+value+percent',
  textfont = list(size = 8),
  title = "Dog Licenses by Borough",
  domain = list(x = c(0.35, 0.65)),
  marker = list(colors = RColorBrewer::brewer.pal(n = length(license_pie_data$borough), name = "Set3"))
) 

rate_borough_data = borough_data |>
  mutate(Bite_to_License_Ratio = Bite_Count / License_Count,
         Ratio_Percentage = Bite_to_License_Ratio / sum(Bite_to_License_Ratio) * 100,
         Ratio_Label = paste0(borough, " (", round(Ratio_Percentage, 1), "%)"))
rate_borough_data |> 
  dplyr::select(borough, Bite_Count, License_Count, Bite_to_License_Ratio) %>% # 选择前四列
  mutate(Bite_to_License_Ratio = scales::percent(Bite_to_License_Ratio)) |>
  kable(col.names = c("Borough", "Bite Count", "License Count", "Bite to License Ratio (%)"),
        align = c("l", "c", "c", "c"),
  caption = "Dog Bite to License Ratio by Borough")

Dog Bite to License Ratio by Borough
Borough	Bite Count	License Count	Bite to License Ratio (%)
Bronx	3782	52627	7.19%
Brooklyn	4985	135596	3.68%
Manhattan	5270	182398	2.89%
Queens	5773	104288	5.54%
Staten Island	1872	44477	4.21%

ratio_pie_interactive = plot_ly(
  data = rate_borough_data,
  labels = ~borough,
  values = ~Bite_to_License_Ratio,
  type = 'pie',
  textinfo = 'label+percent',
  hoverinfo = 'label+value+percent',
  textfont = list(size = 8),
  title = "Ratio of Bites to Licenses by Borough",
  domain = list(x = c(0.7, 1)), 
  marker = list(colors = RColorBrewer::brewer.pal(n = length(rate_borough_data$borough), name = "Set3"))
) 


combined_pies <- subplot(
  bite_pie_interactive, 
  license_pie_interactive, 
  ratio_pie_interactive,
  nrows = 1,
  margin = 0.05 
) %>% layout(
  title = "Comprehensive Dog Data by Borough", 
  titlefont = list(size = 16, color = 'black'),
  legend = list(
    orientation = "h", 
    x = 0.5,            
    xanchor = "center", 
    y = -0.2,          
    yanchor = "bottom" 
  ),
  margin = list(b = 100) 
)
combined_pies

The first and second pie charts display the proportions of dog bites and dog licenses by boroughs, and they are consistent with the bar plot above. The third pie chart displays the bite-to-license ratios by boroughs. Looking at this metric, Queens has a high ratio of 23.6% but is topped by Bronx, which has the highest bite-to-license ratio of 30.6%. This means that Bronx is also a possible hotspot for dog bite incidents. In our spatial correlation analysis, we will examine areas with high dog bite incidence more in-depth.

Exploratory Data Analysis

Import and tidy up data

Dog bites by breeds

Dog bites by gender and neuter status

NYC Dog Licensing Counts

Dog bites by borough