Collecting Data
Scraping Multiple HTML Data Tables:
import pandas as pd # Import pandas (used below for read_html, dataframes, and csv output)
from gazpacho import get, Soup # Import gazpacho
url = "https://tax.alaska.gov/programs/oil/production/ans.aspx?" # Set the base url as a string
html = get(url) # Get the html from the url
soup = Soup(html) # Pass html into soup
option_tags = soup.find('option')[1:] # Find all the 'option' html tags, skipping the first placeholder (renamed to avoid shadowing the built-in list)
length = len(option_tags) # Number of records in the list (month/year)
urls = [] # Create an empty list for all the urls
for i in range(length): # Loop through all the 'option' tags for the length of the list
    option_value = option_tags[i].attrs['value'] # Pull the tag's 'value' attribute (the query string for that month/year)
    urls.append(url + option_value) # and add the full url to the urls list
url_count = len(urls) # Number of urls in the list
for url in urls:
    scraped_df = pd.read_html(url)[6] # Read the 7th data table from each url (read_html already returns dataframes)
    scraped_df = scraped_df.dropna(axis=1, how='all') # Drop columns that are all NaN
    scraped_df = scraped_df.dropna(axis=0, how='all') # Drop rows that are all NaN
    header_row = 1 # Identify the row holding the header values
    scraped_df.columns = scraped_df.iloc[header_row] # Rename columns to the header-row values
    scraped_df['Date'] = pd.to_datetime(scraped_df['Date'], errors='coerce') # Convert the 'Date' column to datetime
    scraped_df = scraped_df.dropna(axis=0, how='any', subset=['Date']) # Drop any rows with NaN in 'Date'
    scraped_df = scraped_df.dropna(axis=1, how='all') # Drop columns that are all NaN
    cols = scraped_df.columns.drop('Date') # All columns except 'Date'
    scraped_df[cols] = scraped_df[cols].apply(pd.to_numeric, errors='coerce') # Convert the remaining columns to numeric
    file_df = pd.read_csv('Resources/alaska_oil_production.csv') # Read the previously scraped data file into pandas
    file_df['Date'] = pd.to_datetime(file_df['Date'], errors='coerce') # Convert its 'Date' column to datetime as well
    merged_df = pd.concat([file_df, scraped_df]) # Merge the scraped_df into the file_df (DataFrame.append is removed in pandas 2.x)
    merged_df = merged_df.sort_values(by='Date', ascending=False) # Sort by 'Date' in descending order
    merged_df['duplicates'] = merged_df.duplicated('Date', keep='last') # Flag duplicate dates, keeping the last occurrence
    merged_df = merged_df[merged_df['duplicates'] == False] # Keep only the rows flagged False
    merged_df = merged_df.drop(columns=['duplicates']) # Delete the helper 'duplicates' column
    merged_df = merged_df.reset_index(drop=True) # Reset the merged dataframe index
    merged_df.to_csv('Resources/alaska_oil_production.csv', mode='w', index=False) # Write the merged dataframe back to the csv
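A quick sanity check after the scrape, re-reading the merged file and confirming the de-duplication step worked (a minimal sketch, assuming the same file path and 'Date' column used above):
check_df = pd.read_csv('Resources/alaska_oil_production.csv', parse_dates=['Date']) # Re-read the merged csv
print(check_df['Date'].min(), check_df['Date'].max()) # Date range now covered by the file
print(check_df['Date'].duplicated().sum()) # Should print 0 after the duplicate-removal step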
Collecting Data via API:
import pandas as pd
import requests
from config import eia_key # EIA API key kept in a local config.py
series_id = 'PET.F005071__3.M'
url = 'http://api.eia.gov/series/?'
query_url = f'{url}api_key={eia_key}&series_id={series_id}' # Build query
query = requests.get(query_url).json() # Request the series and parse the json response
series = query['series'] # The 'series' key holds a list of matching series
name = series[0]['name'] # Human-readable name of the series
data = series[0]['data'] # List of [YYYYMM, value] pairs
df = pd.DataFrame(data) # Load the pairs into a dataframe
df = df.rename(columns={0: "YYYYMM", 1: "spot_price"}) # Name the period and price columns
df.to_csv('Resources/eia_spot_prices.csv', mode='w', index=False) # Write the spot price dataframe to csv
# TO DO: parse the YYYYMM column to separate years from months to enable blending with the production data
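One possible way to handle that TO DO, assuming the YYYYMM values are six-digit period strings as written above (the 'year', 'month', and 'Date' column names are placeholders, not part of the project):
df = pd.read_csv('Resources/eia_spot_prices.csv', dtype={'YYYYMM': str}) # Re-read, keeping YYYYMM as text
df['year'] = df['YYYYMM'].str[:4].astype(int) # First four characters are the year
df['month'] = df['YYYYMM'].str[4:6].astype(int) # Last two characters are the month
df['Date'] = pd.to_datetime(df['YYYYMM'], format='%Y%m', errors='coerce') # Month-start dates, matching the production data's 'Date' column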