In this stage of the project you need to collect data on the locations that will be included. Each location should contain longitude and latitude, as well as a location name, for later visualization. We recommend selecting at most 9 locations so that the optimal route can be found in a reasonable amount of time.
You can use any source for locations, but if you feel stuck you can try searching for datasets on Kaggle. There are many datasets with cities there which you can use for selecting the locations. Keep in mind that when you decide to take a dataset from Kaggle you need to check its quality and, if necessary, preprocess the data, e.g. check for incorrect values (points outside a reasonable range) or missing values.
pip install folium
pip install ortools
pip install osmnx
# import libraries
import pandas as pd
import numpy as np
import re
# for plotting
import folium
from folium import plugins
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
# for simple routing
import osmnx as ox #1.2.2
import networkx as nx #3.0
# for advanced routing
from ortools.constraint_solver import pywrapcp #9.6
from ortools.constraint_solver import routing_enums_pb2
# import data
# NOTE(review): hard-coded absolute Windows path — consider a relative path for portability
df = pd.read_csv("C:\\Users\\User\\Documents\\TripleTen\\Code Jam\\Dataset\\df_2.csv")
# general info on df: column names, dtypes, and non-null counts
df.info()
# preview the first rows of the raw dataset
df.head()
Upon initial inspection, the only missing values are located in the 'Image' column, which is not needed and will be dropped. Because we will only be working with the 9 rows of the California National Parks, it is not necessary to check for duplicates.
# Remove columns that are not needed for routing or visualization.
df = df.drop(columns=['Image', 'Unnamed: 0'])
# Bring the column names in line with snake_case style guidelines.
column_renames = {
    'Name': 'name_of_park',
    'Location': 'location',
    'Date established as park[7][12]': 'date_established_as_park',
    'Area (2021)[13]': 'area_in_acres',
    'Recreation visitors (2021)[11]': 'visitors_in_2021',
    'Description': 'description',
}
df = df.rename(columns=column_renames)
# Confirm the schema after dropping and renaming.
df.info()
# Keep only the parks whose location text mentions California.
in_california = df['location'].str.contains("California")
df = df.loc[in_california]
# Display the filtered frame.
df
The new df contains no missing or duplicate values and consists of the 9 national parks located in California. These are the locations for which we will be optimizing routes.
# convert 'date_established_as_park' to datetime data type
df['date_established_as_park'] = pd.to_datetime(df['date_established_as_park'])
# split location column
# The raw 'location' string is "<state text><coordinate text>"; splitting on the
# first digit (n=1, captured group) separates the state prefix from the coordinates.
location = df.location.str.split(r'(\d)',n=1,expand=True)
# rename: column 0 holds the text before the first digit, i.e. the state
location = location.rename(columns={location.columns[0]:'state'})
# add separator back into column
# column 1 is the single captured digit; prepend it so column 2 holds the full coordinate string again
location[2] = location[1] + location[2]
# drop separator column (the lone captured digit, now merged back into column 2)
location = location.drop([1],axis=1)
# split lat-lon coordinates into 2 columns
# the raw text presumably holds two coordinate representations separated by '/';
# keep only the first one (degree-minute style) — TODO confirm against source data
location[['loc1','loc2']] = location[2].str.split('/',expand=True)
# drop 2nd lat-lon column and the combined coordinate string
location = location.drop([2,'loc2'],axis=1)
# check changes
location
# split loc1 column into lat and lon columns
# n=1 splits on the first space only: "<lat> <lon>"
location[['lat','lon']] = location.loc1.str.split(' ',n=1,expand=True)
# check changes
location
# create function to convert from degree minute format to degree decimals
def ddm2dec(dms_str):
    """Convert a degrees-minutes coordinate string (e.g. "36°36'N") to
    signed decimal degrees.

    Args:
        dms_str: string containing degrees, then minutes, separated by any
            non-digit characters, with an N/S/E/W hemisphere suffix.

    Returns:
        float: decimal degrees; negative for south latitudes and west
        longitudes (S/W suffix).
    """
    # negative for the southern/western hemispheres
    sign = -1 if re.search('[swSW]', dms_str) else 1
    # split on runs of non-digit characters: the first two numeric fields are
    # degrees and minutes (raw string avoids the invalid-escape-sequence
    # warning that '\D+' raises on modern Python)
    numbers = re.split(r'\D+', dms_str)
    degree = numbers[0]
    minute_decimal = numbers[1]
    return sign * (int(degree) + float(minute_decimal) / 60)
# Normalize the prime mark (′) to an ASCII apostrophe, then convert each
# coordinate column from degree-minute text to signed decimal degrees.
for coord_col in ('lat', 'lon'):
    location[coord_col] = location[coord_col].str.replace("′", "'")
    location[coord_col] = location[coord_col].apply(ddm2dec)
# Display the converted coordinates.
location
# create latitude, longitude, and state columns in original df
# (index-aligned assignment: 'location' was derived from df, so their indices match)
df['lat'] = location['lat']
df['lon'] = location['lon']
df['state'] = location['state']
# check changes
df
# drop unnecessary original location and new loc1 columns
# NOTE(review): only 'location' is dropped here; 'loc1' lives in the helper
# frame 'location', not in df, so there is nothing else to drop from df
df = df.drop('location',axis=1)
# check changes
display(df)
# extract only numerical area info from area_in_acres column & convert to numeric
# capture the leading numeric token (digits plus optional commas/periods)
df['area_in_acres'] = df.area_in_acres.str.extract(r'(\d+[,.\d]*)')
# strip thousands separators so pd.to_numeric can parse the value
df['area_in_acres'] = df.area_in_acres.apply(lambda x: x.replace(",",""))
df['area_in_acres'] = pd.to_numeric(df.area_in_acres)
# check changes
df.info()
# remove asterisks (footnote markers) from park names
df.name_of_park = df.name_of_park.apply(lambda x:x.replace(' *',''))
df.name_of_park.unique()
# summary statistics for the numeric columns
df.describe()
# geo scatter on a USA projection: park locations, marker size = 2021 visitor count
fig = px.scatter_geo(df, lat='lat',lon='lon', hover_name='name_of_park', size='visitors_in_2021', locationmode='USA-states',
height=500, width=800,
labels={'visitors_in_2021':'Visitors','lat':'Latitude','lon':'Longitude'})
fig.update_layout(
title_text='California National Parks and Visitors',
geo_scope='usa')
fig.show()
# same data on an OpenStreetMap tile basemap, centered on California
map = px.scatter_mapbox(df, lat='lat', lon='lon', hover_name='name_of_park',
title='US National Parks in California',size='visitors_in_2021',
center={'lat':36.7783,'lon':-119.4179},zoom=5,
width=700,height=900,
labels={'visitors_in_2021':'Visitors','lat':'Latitude','lon':'Longitude'})
# NOTE(review): 'map' shadows the builtin; consider a name like visitors_map
map.update_layout(mapbox_style="open-street-map",geo=dict(scope='usa'))
map.show()
# geo scatter on a USA projection: park locations, marker size = park area in acres
fig = px.scatter_geo(df, lat='lat', lon='lon', hover_name='name_of_park', size='area_in_acres', locationmode='USA-states',
height=500, width=800,
labels={'area_in_acres':'Area (acres)','lat':'Latitude','lon':'Longitude'})
fig.update_layout(
title_text='US National Parks & Area in Acres',
geo_scope='usa')
fig.show()
# same data on an OpenStreetMap tile basemap, centered on California
map = px.scatter_mapbox(df, lat='lat', lon='lon', hover_name='name_of_park',
title='US National Parks in California',size='area_in_acres',
center={'lat':36.7783,'lon':-119.4179},zoom=5,
width=700,height=900,
labels={'area_in_acres':'Area (Acres)','lat':'Latitude','lon':'Longitude'})
# NOTE(review): 'map' shadows the builtin; consider a name like area_map
map.update_layout(mapbox_style="open-street-map",geo=dict(scope='usa'))
map.show()
# Bar chart of 2021 recreation visitor counts per California park.
visitors_fig = plt.figure(figsize=(13, 5))
plt.bar(
    df['name_of_park'],
    df['visitors_in_2021'],
    color='khaki',
    width=0.8,
)
plt.xlabel("California National Park")
plt.ylabel("No. of visitors in 2021")
plt.title("Recreational Visitors at California's National Parks in 2021")
plt.show()
# plot dates established as parks, ordered by establishment date
# BUG FIX: the sorted frame 'date' was built but never used (the unsorted df
# was plotted instead), and the labels dicts used keys ("x",
# "df['date_established_as_park']") that do not match the inferred series
# names, so the axis relabels never took effect.
date = df.sort_values(by='date_established_as_park')
fig = px.scatter(
    date,
    x='date_established_as_park',
    y='name_of_park',
    labels={'date_established_as_park': 'Year', 'name_of_park': 'Park'},
    title='Dates Established as US National Parks in the State of California',
)
fig.show()
# establishment date vs. 2021 visitor count, one color/symbol per park
fig = px.scatter(
    df,
    x='date_established_as_park',
    y='visitors_in_2021',
    color='name_of_park',
    symbol='name_of_park',
    title='Year of Establishment and Number of Visitors in 2021',
    labels={
        'date_established_as_park': 'Date Established as Park',
        'visitors_in_2021': 'Number of Visitors in 2021',
        'name_of_park': 'Park',
    },
)
fig.update_traces(marker_size=10)
fig.show()
Fun Fact:
Joshua Tree National Park in California had 3,064,400 visitors in 2021. This was second only to Yosemite, which was established as a park in 1890, over 100 years earlier. Joshua Tree's popularity may have something to do with its driving distance and accessibility from major cities such as Los Angeles and San Diego.
# Bar chart of each California national park's area in acres.
area_fig = plt.figure(figsize=(13, 5))
plt.bar(
    df['name_of_park'],
    df['area_in_acres'],
    color='maroon',
    width=0.8,
)
plt.xlabel("National Park")
plt.ylabel("Area in Acres")
plt.title("California's National Parks by Size")
plt.show()
# Pearson correlation between 2021 visitor counts and park area in acres.
corr = df['visitors_in_2021'].corr(df['area_in_acres'])
print('The correlation between the area of the parks in acres and the number of visitors is:', corr)
# Scatter plot with a fitted regression line to visualize the relationship.
ax = sns.regplot(data=df, x="area_in_acres", y="visitors_in_2021")
ax.set_title('Correlation of Park Size to Visitors in 2021')
ax.set_xlabel('Park Size')
ax.set_ylabel('Visitors in 2021')
plt.show()
The seemingly low (0.228) correlation coefficient suggests little to no relationship between the two columns.
df
# build the hover text: park name plus its area in acres
df['text'] = df['name_of_park'] + '<br>Area ' + (df['area_in_acres']).astype(str)+' acres'
# positional row-slice boundaries that split the 9 parks into 4 color groups
limits = [(0,2),(2,5),(5,7),(7,9)]
# NOTE(review): the 5th color is never used — only 4 limit groups exist
colors = ["royalblue","crimson","lightseagreen","orange","lightgrey"]
# NOTE(review): 'cities' is never used — candidate for removal
cities = []
# divisor mapping acres to a readable marker area
scale = 5000
fig = go.Figure()
# one Scattergeo trace per row-slice group so each group gets its own color/legend entry
for i in range(len(limits)):
    lim = limits[i]
    # positional slice of the parks belonging to this group
    df_sub = df[lim[0]:lim[1]]
    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = df_sub['lon'],
        lat = df_sub['lat'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['area_in_acres']/scale,
            color = colors[i],
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ),
        # legend label shows the row range covered by the group
        name = '{0} - {1}'.format(lim[0],lim[1])))
fig.update_layout(
    title_text = 'California National Park Areas<br>(Click legend to toggle traces)',
    showlegend = True,
    geo = dict(
        scope = 'usa',
        landcolor = 'rgb(217, 217, 217)',
    )
)
fig.show()
# index by park name so distance-matrix rows/columns are labeled by park
df.set_index('name_of_park',inplace=True)
# keep only the coordinate columns needed for routing
sites = df.loc[:,['lat','lon']]
sites
# function to get distance between points using open source routing
def get_distance(point1: dict, point2: dict) -> float:
    """Return the driving distance between two points via the public OSRM
    route service: http://project-osrm.org/docs/v5.10.0/api/#route-service

    Args:
        point1: mapping with 'lat' and 'lon' keys (origin).
        point2: mapping with 'lat' and 'lon' keys (destination).

    Returns:
        float: driving distance of the best route found (OSRM reports
        distances in meters).
    """
    # OSRM expects lon,lat order; geometry and alternatives are disabled to keep the payload small
    url = f"""http://router.project-osrm.org/route/v1/driving/{point1["lon"]},{point1["lat"]};{point2["lon"]},{point2["lat"]}?overview=false&alternatives=false"""
    # timeout so a stalled request cannot hang the whole matrix computation
    r = requests.get(url, timeout=30)
    # get the distance from the returned values (first/best route)
    route = json.loads(r.content)["routes"][0]
    return (route["distance"])
import requests
import json
# function to create distance matrix based on distance from open source routing
def compute_distance_matrix(dsites, dist_metric=None):
    """Create an N x N distance matrix from a dataframe of N locations
    with a latitude column and a longitude column.

    Args:
        dsites: DataFrame indexed by location name, with 'lat' and 'lon'
            columns (each row is passed to dist_metric as a Series).
        dist_metric: callable(point1, point2) -> distance. Defaults to the
            OSRM-backed get_distance (resolved at call time so the default
            stays backward-compatible).

    Returns:
        DataFrame: pairwise distances, rows/columns labeled by dsites.index.
    """
    # BUG FIX: the original body ignored both parameters — it iterated the
    # module-level `sites` frame and called get_distance directly, so passing
    # a different dataframe or metric had no effect.
    if dist_metric is None:
        dist_metric = get_distance
    df_dist_matrix = pd.DataFrame(index=dsites.index,
                                  columns=dsites.index)
    for orig, orig_loc in dsites.iterrows():      # for each origin
        for dest, dest_loc in dsites.iterrows():  # for each destination
            df_dist_matrix.at[orig, dest] = dist_metric(orig_loc, dest_loc)
    return df_dist_matrix
# build the full pairwise driving-distance matrix (one OSRM request per ordered pair)
df_distances = compute_distance_matrix(sites)
display(df_distances)
# convert distance matrix to array for the TSP solver
distance_matrix = df_distances.values
distance_matrix
pip install python-tsp
from python_tsp.exact import solve_tsp_brute_force
# brute-force exact TSP — feasible only because there are few locations
order, distance_tot = solve_tsp_brute_force(distance_matrix)
# visiting order: row indices into distance_matrix / sites
order
# total route distance (same units as get_distance's output)
distance_tot