I have downloaded a list of all the towns and cities etc in the US from the census bureau. Here is a random sample:
dput(somewhere)
structure(list(state = structure(c(30L, 31L, 5L, 31L, 24L, 36L,
13L, 21L, 6L, 10L, 31L, 28L, 10L, 5L, 5L, 8L, 23L, 11L, 34L,
19L, 29L, 4L, 24L, 13L, 21L, 31L, 2L, 3L, 29L, 24L, 1L, 13L,
15L, 10L, 11L, 33L, 35L, 8L, 11L, 12L, 36L, 28L, 9L, 31L, 8L,
14L, 11L, 12L, 36L, 13L, 8L, 5L, 29L, 8L, 7L, 23L, 25L, 39L,
16L, 28L, 10L, 29L, 26L, 8L, 32L, 40L, 28L, 23L, 37L, 31L, 18L,
5L, 1L, 31L, 18L, 13L, 11L, 10L, 25L, 18L, 21L, 18L, 11L, 35L,
31L, 36L, 20L, 19L, 38L, 2L, 40L, 13L, 36L, 11L, 29L, 27L, 22L,
17L, 12L, 20L), .Label = c("ak", "al", "ar", "az", "ca", "co",
"fl", "ga", "hi", "ia", "il", "in", "ks", "ky", "la", "md", "mi",
"mo", "ms", "mt", "nc", "nd", "ne", "nj", "nm", "nv", "ny", "oh",
"ok", "or", "pa", "pr", "ri", "sc", "sd", "tx", "ut", "va", "wa",
"wi"), class = "factor"), geoid = c(4120100L, 4280962L, 668028L,
4243944L, 3460600L, 4871948L, 2046000L, 3747695L, 839965L, 1909910L,
4244824L, 3902204L, 1963750L, 622360L, 669088L, 1382972L, 3125230L,
1722736L, 4539265L, 2804705L, 4039625L, 451465L, 3467020L, 2077150L,
3765260L, 4221792L, 133904L, 566320L, 4033150L, 3463180L, 223460L,
2013675L, 2232405L, 1951600L, 1752142L, 4445010L, 4655684L, 1336416L,
1729080L, 1840842L, 4804672L, 3932207L, 1523675L, 4260384L, 1321912L,
2159232L, 1735307L, 1867176L, 4839304L, 2057350L, 1309656L, 655380L,
4082250L, 1350680L, 1275475L, 3147745L, 3505010L, 5352285L, 2483337L,
3909834L, 1912945L, 4068200L, 3227900L, 1366304L, 7286831L, 5505350L,
3982390L, 3149915L, 4974480L, 4249440L, 2943346L, 677430L, 280770L,
4247872L, 2902242L, 2039075L, 1735281L, 1932565L, 3580120L, 2973852L,
3722620L, 2943238L, 1755938L, 4643100L, 4251904L, 4830920L, 3056575L,
2801940L, 5156048L, 137000L, 5508925L, 2057300L, 4861172L, 1736477L,
4021200L, 3677783L, 3832060L, 2614900L, 1820332L, 3043750L),
ansicode = c(2410344L, 2390453L, 2411793L, 1214759L, 885360L,
2412035L, 485621L, 2403359L, 2412812L, 2583481L, 2390095L,
2397971L, 2396237L, 2585422L, 2411819L, 2405746L, 2398338L,
2394628L, 2812929L, 2586582L, 2408478L, 2582836L, 885393L,
2397270L, 2402898L, 2584453L, 2405811L, 2405518L, 2412737L,
2389752L, 2418574L, 2393549L, 2402559L, 2629970L, 2399453L,
2378109L, 2812999L, 2402563L, 2398956L, 2396699L, 2409759L,
2393028L, 2414061L, 2805542L, 2404192L, 2404475L, 2398514L,
2629884L, 2408486L, 2396265L, 2405306L, 2411363L, 2413515L,
2405064L, 2402989L, 2583899L, 2629103L, 2585016L, 2390487L,
2397481L, 2393811L, 2413298L, 2583927L, 2812702L, 2415078L,
1582764L, 2400116L, 2400036L, 2412013L, 2633665L, 2787908L,
2583158L, 2418866L, 1214943L, 2393998L, 485611L, 2398513L,
2394969L, 2806756L, 2397053L, 2406485L, 2395719L, 2399572L,
1267480L, 2389516L, 2410660L, 2409026L, 2806379L, 2584894L,
2404746L, 2586459L, 2396263L, 2411528L, 2398556L, 2412443L,
2584298L, 1036064L, 2806333L, 2396920L, 2804282L), city = c("donald",
"warminster heights", "san juan capistrano", "littlestown",
"port republic", "taylor", "merriam", "northlakes", "julesburg",
"california junction", "lower allen", "antwerp", "pleasantville",
"el rancho", "santa clarita", "willacoochee", "kennard",
"effingham", "la france", "beechwood", "keys", "orange grove mobile manor",
"shiloh", "west mineral", "stony point", "east salem", "heath",
"stamps", "haworth", "rio grande", "ester", "clayton", "hackberry",
"middle amana", "new baden", "melville", "rolland colony",
"hannahs mill", "germantown hills", "la fontaine", "aurora",
"green meadows", "kaiminani", "pinecroft", "dawson", "park city",
"hinsdale", "st. meinrad", "kingsland", "powhattan", "bowersville",
"palos verdes estates", "wyandotte", "meigs", "waverly",
"sunol", "arroyo hondo", "outlook", "west pocomoke", "buchtel",
"chatsworth", "smith village", "glenbrook", "rock spring",
"villalba", "bayfield", "waynesfield", "utica", "sunset",
"milford square", "lithium", "swall meadows", "unalaska",
"martinsburg", "ashland", "leawood", "hindsboro", "gray",
"turley", "trimble", "falcon", "linn", "olympia fields",
"mitchell", "mount pleasant mills", "greenville", "park city",
"arkabutla", "new river", "huntsville", "boulder junction",
"potwin", "red lick", "huey", "dougherty", "wadsworth", "grand forks",
"chassell", "edgewood", "lindsay"), lsad = c("25", "57",
"25", "21", "25", "25", "25", "57", "43", "57", "57", "47",
"25", "57", "25", "25", "47", "25", "57", "57", "57", "57",
"21", "25", "57", "57", "43", "25", "43", "57", "57", "25",
"57", "57", "47", "57", "57", "57", "47", "43", "25", "57",
"57", "57", "25", "25", "47", "57", "57", "25", "43", "25",
"43", "25", "57", "57", "57", "57", "57", "47", "25", "43",
"57", "57", "62", "25", "47", "47", "25", "57", "57", "57",
"25", "21", "25", "25", "47", "25", "57", "25", "43", "25",
"47", "25", "57", "25", "57", "57", "57", "25", "57", "25",
"25", "47", "43", "57", "25", "57", "43", "57"), funcstat = c("a",
"s", "a", "a", "a", "a", "a", "s", "a", "s", "s", "a", "a",
"s", "a", "a", "a", "a", "s", "s", "s", "s", "a", "a", "s",
"s", "a", "a", "a", "s", "s", "a", "s", "s", "a", "s", "s",
"s", "a", "a", "a", "s", "s", "s", "a", "a", "a", "s", "s",
"a", "a", "a", "a", "a", "s", "s", "s", "s", "s", "a", "a",
"a", "s", "s", "s", "a", "a", "a", "a", "s", "s", "s", "a",
"a", "a", "a", "a", "a", "s", "a", "a", "a", "a", "a", "s",
"a", "s", "s", "s", "a", "s", "a", "a", "a", "a", "s", "a",
"s", "a", "s"), latitude = c(45.221487, 40.18837, 33.500889,
39.74517, 39.534798, 30.573263, 39.017607, 35.780523, 40.984864,
41.56017, 40.226748, 41.180176, 41.387011, 36.220684, 34.414083,
31.335094, 41.474697, 39.120662, 34.616281, 32.336723, 35.802786,
32.598451, 39.462418, 37.283906, 35.867809, 40.608713, 31.344839,
33.354959, 33.840898, 39.019051, 64.879056, 39.736866, 29.964958,
41.794765, 38.536765, 41.559549, 44.3437, 32.937302, 40.768954,
40.673893, 33.055942, 39.867193, 19.757709, 40.564189, 31.771864,
37.093499, 41.800683, 38.168142, 30.666141, 39.761734, 34.373065,
33.774271, 36.807143, 31.071788, 27.985282, 41.154105, 36.534599,
46.331153, 38.096527, 39.463511, 42.916301, 35.45079, 39.100123,
34.81467, 18.127809, 46.81399, 40.602442, 40.895279, 41.13806,
40.433182, 37.831844, 37.50606, 53.910255, 40.310917, 38.812464,
38.907263, 39.684775, 41.841711, 36.736661, 39.476152, 35.194804,
38.478798, 41.521996, 43.730057, 40.724697, 33.111939, 45.630946,
34.700227, 37.142945, 34.782275, 46.1148, 37.938624, 33.485081,
38.605285, 34.399808, 42.821447, 47.921291, 47.036116, 40.103208,
47.224885), longitude = c(-122.837813, -75.084089, -117.654388,
-77.089213, -74.476099, -97.427116, -94.693955, -81.367835,
-102.262708, -95.994752, -76.902769, -84.736099, -93.272787,
-119.068357, -118.494729, -83.044003, -96.203696, -88.550859,
-82.770697, -90.808692, -94.941358, -114.660588, -75.29244,
-94.926801, -81.044121, -77.23694, -86.46905, -93.497879,
-94.657035, -74.87787, -148.041153, -100.176484, -93.410178,
-91.901539, -89.707193, -71.301933, -96.59226, -84.340945,
-89.462982, -85.722023, -97.509615, -83.945334, -156.001765,
-78.353464, -84.443499, -86.048077, -87.928172, -86.832128,
-98.454026, -95.634011, -83.084305, -118.425754, -94.729305,
-84.092683, -81.625304, -102.762746, -105.666602, -120.092812,
-75.579197, -82.180426, -96.514499, -97.457006, -119.927289,
-85.238869, -66.481897, -90.822546, -83.973881, -97.345349,
-112.028388, -75.405024, -89.88325, -118.642656, -166.529029,
-78.324286, -92.239531, -94.62524, -88.134729, -94.985863,
-107.792147, -94.561898, -78.65389, -91.844989, -87.691648,
-98.029974, -77.026451, -96.110256, -108.925311, -90.121565,
-80.595817, -86.532599, -89.654438, -97.01835, -94.161474,
-89.289973, -97.05148, -77.893875, -97.08933, -88.530745,
-85.737461, -105.152791), designation = c("city", "cdp",
"city", "borough", "city", "city", "city", "cdp", "town",
"cdp", "cdp", "village", "city", "cdp", "city", "city", "village",
"city", "cdp", "cdp", "cdp", "cdp", "borough", "city", "cdp",
"cdp", "town", "city", "town", "cdp", "cdp", "city", "cdp",
"cdp", "village", "cdp", "cdp", "cdp", "village", "town",
"city", "cdp", "cdp", "cdp", "city", "city", "village", "cdp",
"cdp", "city", "town", "city", "town", "city", "cdp", "cdp",
"cdp", "cdp", "cdp", "village", "city", "town", "cdp", "cdp",
"urbana", "city", "village", "village", "city", "cdp", "cdp",
"cdp", "city", "borough", "city", "city", "village", "city",
"cdp", "city", "town", "city", "village", "city", "cdp",
"city", "cdp", "cdp", "cdp", "city", "cdp", "city", "city",
"village", "town", "cdp", "city", "cdp", "town", "cdp")), row.names = c(22769L,
24845L, 3314L, 24015L, 17360L, 28139L, 10085L, 19881L, 3886L,
8750L, 24027L, 20585L, 9362L, 2499L, 3333L, 6041L, 16321L, 6847L,
25249L, 14051L, 22233L, 1210L, 17425L, 10353L, 20053L, 23545L,
253L, 1951L, 22166L, 17386L, 685L, 9771L, 11134L, 9225L, 7386L,
25001L, 25862L, 5663L, 6950L, 8239L, 26555L, 20991L, 6108L, 24388L,
5551L, 10772L, 7056L, 8470L, 27292L, 10202L, 5451L, 3116L, 22660L,
5776L, 5317L, 16546L, 17582L, 29958L, 12103L, 20709L, 8779L,
22515L, 16665L, 5902L, 31901L, 30658L, 21745L, 16574L, 28632L,
24127L, 15046L, 3455L, 930L, 24087L, 14494L, 10016L, 7055L, 8993L,
18048L, 15434L, 19615L, 15043L, 7454L, 25775L, 24194L, 27115L,
15857L, 14038L, 29305L, 276L, 30693L, 10201L, 27863L, 7075L,
22046L, 19267L, 20311L, 12502L, 8093L, 15798L), class = "data.frame")
I would like to calculate the distance between each entry in the city column using the longitude and latitude columns and the gdist function. I know that the following for loop works and is easy to read:
dist_list <- list()
for (i in 1:nrow(somewhere)) {
dist_list[[i]] <- gdist(lon.1 = somewhere$longitude[i],
lat.1 = somewhere$latitude[i],
lon.2 = somewhere$longitude,
lat.2 = somewhere$latitude,
units="miles")
}
However: IT TAKES FOREVER to run on the full dataset (31K+ rows)--as in hours. I'm looking for something that will speed up this calculation. I figured something in the split-apply-combine approach would work well, since I would like to eventually involve a pair of grouping variables, geo_block and ansi_block but honestly anything would be better than what I have.
I have tried the following:
somewhere$geo_block <- substr(somewhere$geoid, 1, 1)
somewhere$ansi_block <- substr(somewhere$ansicode, 1, 1)
somewhere <- somewhere %>%
split(.$geo_block, .$ansi_block) %>%
mutate(dist = gdist(longlat = somewhere[, c("longitude", "latitude")]))
But am unsure of how to specify the second set of long-lat inputs outside of the standard for loop.
My question:
- How do I use the
split-apply-combineapproach to solve this problem withgeo_blockandansi_blockas a grouping variable as above? I would like to return the shortest distance and the name of thecityand the value ofgeo_blockcorresponding to this distance.
All suggestions are welcome. Ideally the desired result would be fairly quick because the actual dataset I'm working with is quite large. Since I'm a bit in the woods here, I've added a bounty to the question to generate a little more interest and hopefully a wide set of potential answers that I can learn from. Thanks so much!