- I used Kaggle as the source of the data. I also found an API for this data set, but I did not use it because it was too slow for me
- the data I used has no dead trees; the only status values are Poor, Fair and Good
- I used the pandas "percentage of total with groupby" technique to calculate the percentages
- I prefer plotly to matplotlib for plotting. Both are simple to use
- there really are too many bars for this to be a high-quality visualisation
get data from API (kaggle)
import kaggle.cli
import sys
import pandas as pd
from pathlib import Path
from zipfile import ZipFile
# Download the NYC 2015 street tree census archive from Kaggle and load its
# first member into the DataFrame `df`.
#
# To search for the data set instead of downloading it, run the CLI with:
#   sys.argv = [sys.argv[0], "datasets", "list", "-s", "2015-street-tree-census-tree-data.csv"]
#   kaggle.cli.main()

# download data set
# The Kaggle CLI entry point is driven through sys.argv here, so the arguments
# are set only for the duration of the call and the caller's argv is restored
# afterwards, even if the CLI raises or exits.
_saved_argv = sys.argv
sys.argv = [
    sys.argv[0],
    "datasets",
    "download",
    "new-york-city/ny-2015-street-tree-census-tree-data",
]
try:
    kaggle.cli.main()
finally:
    sys.argv = _saved_argv

# Context managers make sure both the archive and the member stream are closed
# once the CSV has been read completely into memory.
with ZipFile("ny-2015-street-tree-census-tree-data.zip") as zfile:
    members = zfile.infolist()
    # use CSV (the first entry in the archive)
    with zfile.open(members[0]) as csv_file:
        df = pd.read_csv(csv_file)
prepare data and plot using plotly
import plotly.express as px
spc = 'spc_common'
# aggregate the data and shape it for plotting
# 1. count trees per (species, health) pair;
# 2. divide each count by the total for its species to get a share;
# 3. pivot the health values into columns: one row per species, one column
#    per health status.
# The per-species total comes from transform("sum"), which returns a result
# aligned to the original (species, health) index. A transformer-style
# groupby.apply would instead add the group key as an extra index level on
# pandas >= 2.0 and break the later reset_index().
counts = df.groupby([spc, "health"])["tree_id"].count()
dfa = (
    counts.div(counts.groupby(level=spc).transform("sum"))
    .unstack("health")
)
# Bar chart of the Poor / Fair / Good shares for every species, with one fixed
# colour per health status.
health_colours = {"Poor": "red", "Fair": "blue", "Good": "green"}
fig = px.bar(
    dfa.reset_index(),
    x=spc,
    y=list(health_colours.keys()),
    color_discrete_sequence=list(health_colours.values()),
)
# Show the y axis ticks as percentages rather than raw fractions.
fig.update_layout(yaxis=dict(tickformat="%"))
output

matplotlib
import matplotlib.pyplot as plt
# Draw the same per-species health shares as stacked bars with matplotlib,
# on a wide, short figure.
fig = plt.figure(figsize=(14, 3))
ax = fig.subplots()
dfa.plot(ax=ax, kind="bar", stacked=True)