Collecting Data

Scraping Multiple HTML Data Tables:


import pandas as pd # Import pandas (used below for read_html and dataframe cleanup)

from gazpacho import get, Soup # Import gazpacho


url = "https://tax.alaska.gov/programs/oil/production/ans.aspx?" # Set the primary url value as a "string"


html = get(url) # Get the html from the url

soup = Soup(html) # Pass html into soup

option_tags = soup.find('option')[1:] # Find all the 'option' html tags and keep them in a list

length = len(option_tags) # Number of records in the list (month/year)

urls = [] # Create an empty list for all the urls


for i in range(length): # Loop through all the 'option' tags

    options = option_tags[i].attrs['value'] # Pull the 'value' attribute for this option

    # print(url + options)

    urls.append(url + options) # Append the base url plus the option value to the urls list

url_count = len(urls) # Number of urls in the list


for url in urls:

    scraped_df = pd.read_html(url)[6] # Read the 7th data table from each url

    scraped_df = pd.DataFrame(scraped_df) # Ensure scraped_df is a dataframe (read_html already returns dataframes)

    scraped_df = scraped_df.dropna(axis=1, how='all') # Drop columns that are entirely NaN

    scraped_df = scraped_df.dropna(axis=0, how='all') # Drop rows that are entirely NaN

    header_row = 1 # Identify the row that holds the header values

    scraped_df.columns = scraped_df.iloc[header_row] # Rename columns equal to the row with header values

    scraped_df['Date'] = pd.to_datetime(scraped_df['Date'], errors='coerce') # Convert the 'Date' column to datetime datatype

    scraped_df = scraped_df.dropna(axis=0, how='any', subset=['Date']) # Drop any rows with NaN in 'Date'

    scraped_df = scraped_df.dropna(axis=1, how='all') # Drop columns that are all NaN

    cols = scraped_df.columns.drop('Date') # Create a variable for all columns except 'Date'

    scraped_df[cols] = scraped_df[cols].apply(pd.to_numeric, errors='coerce') # Convert all columns except 'Date' to numeric

    file_df = pd.read_csv('Resources/alaska_oil_production.csv') # Read previously scraped data file into Pandas

    file_df['Date'] = pd.to_datetime(file_df['Date'], errors='coerce') # Convert its 'Date' column to datetime as well

    merged_df = pd.concat([file_df, scraped_df]) # Append scraped_df to file_df (pd.concat replaces the deprecated DataFrame.append)

    merged_df = merged_df.sort_values(by='Date', ascending=False) # Sort by 'Date' in descending order

    merged_df['duplicates'] = merged_df.duplicated('Date', keep='last') # Flag duplicate dates (True for all but the last occurrence)

    merged_df = merged_df[merged_df['duplicates'] == False] # Keep only the rows not flagged as duplicates

    merged_df = merged_df.drop(columns=['duplicates']) # Delete the helper 'duplicates' column

    merged_df = merged_df.reset_index(drop=True) # Reset the merged dataframe index

    merged_df.to_csv('Resources/alaska_oil_production.csv', mode='w', index=False) # Write the merged dataframe back to the csv
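
The flag/filter/drop steps used above for duplicate dates could also be collapsed into a single call; a minimal sketch, assuming the same merged_df:

merged_df = merged_df.drop_duplicates(subset='Date', keep='last') # Keep only the last occurrence of each Date

Both versions use keep='last', so they keep the same rows.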


akoil-collecting-data-webscraping.ipynb

alaska_oil_production.csv

Collecting Data via API:


import pandas as pd

import requests


from config import eia_key # Import the EIA API key stored in config.py


series_id = 'PET.F005071__3.M' # EIA series id for the monthly spot price series

url = 'http://api.eia.gov/series/?' # Base url for the EIA series API

query_url = f'{url}api_key={eia_key}&series_id={series_id}' # Build query

query = requests.get(query_url).json() # Request the series and parse the JSON response
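
Equivalently, requests can encode the query parameters itself; a sketch using the same url, eia_key, and series_id:

query = requests.get(url, params={'api_key': eia_key, 'series_id': series_id}).json() # requests builds the query string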

series = query['series'] # Pull the 'series' list out of the response

series[0]['name'] # Preview the series name

name = series[0]['name'] # Store the series name

data = series[0]['data'] # List of [YYYYMM, spot price] records

df = pd.DataFrame(data) # Load the records into a dataframe

df = df.rename(columns={0: "YYYYMM", 1: "spot_price"}) # Rename the default integer column labels

df.to_csv('Resources/eia_spot_prices.csv', mode='w', index=False) # Write the spot price dataframe to csv

# TO DO: parse the YYYYMM column to separate years from months to enable blending with the production data
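
A minimal sketch of that TO DO, assuming the YYYYMM values arrive as six-character strings such as '201905' (the Year and Month column names are placeholders):

df['Year'] = df['YYYYMM'].astype(str).str[:4].astype(int) # First four characters are the year

df['Month'] = df['YYYYMM'].astype(str).str[4:6].astype(int) # Last two characters are the month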


akoil-collecting-data-api.ipynb

eia_spot_prices.csv