Jupyter Notebook Code Walkthrough Day One
Global imports. Establish the base URL for Allrecipes search queries.
In [118]:
import os
from fractions import Fraction
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Allrecipes search endpoint. The search term is appended directly after `q=`;
# further query parameters (e.g. `&offset=`) follow it.
BASE_URL = "https://www.allrecipes.com/search?q="
Use an Allrecipes search query to get a list of recipe URLs.
In [84]:
def get_recipe_list(search_term, max_results=240, page_size=24):
    """Collect recipe links from the Allrecipes search results for ``search_term``.

    The results are paged through with the ``offset`` query parameter, one
    request per page, and the ``href`` of every matching recipe card is kept.

    Args:
        search_term: Free-text query. It is URL-encoded before being appended
            to ``BASE_URL``, so spaces and characters such as ``&`` are safe.
        max_results: Exclusive upper bound of the ``offset`` values requested.
            The default of 240 yields ten requests at the default page size.
        page_size: Step between successive offsets; must be non-zero.
            NOTE(review): 24 is presumably the number of cards the site
            returns per page -- confirm before changing it.

    Returns:
        list[str]: Card links in the order encountered. Cards without an
        ``href`` are skipped instead of being recorded as a placeholder, so
        no non-URL entry ends up in the result.
    """
    links = []
    encoded_term = quote_plus(search_term)
    for offset in range(0, max_results, page_size):
        url = f"{BASE_URL}{encoded_term}&offset={offset}"
        try:
            # A timeout keeps one stalled connection from hanging the notebook.
            response = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            print(f"Failed to fetch the page. Error: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to fetch the page. Status code: {response.status_code}")
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        for card in soup.select("a.mntl-card-list-card--extendable"):
            href = card.get("href")
            if href:
                links.append(href)
    return links
Iterate through the list of URLs returned by the function above and download each webpage to a temp folder.
In [107]:
def download_recipes(links, dir_title):
    """Download every page in ``links`` into the directory ``dir_title``.

    Each successfully fetched page is saved as ``NNN.html``, where ``NNN`` is
    the zero-padded position of the link in ``links``. Links that fail leave
    a gap in the numbering.

    Args:
        links: Iterable of page URLs.
        dir_title: Target directory; created if it does not exist.

    A link that cannot be fetched (malformed URL, connection error, timeout,
    or a non-200 status) is reported with ``print`` and skipped, so a single
    bad entry does not abort the remaining downloads.
    """
    os.makedirs(dir_title, exist_ok=True)
    for index, link in enumerate(links):
        try:
            # A timeout keeps one stalled connection from hanging the notebook.
            response = requests.get(link, timeout=10)
        except requests.RequestException as exc:
            print(f"Failed to fetch the page. Error: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to fetch the page. Status code: {response.status_code}")
            continue
        file_path = os.path.join(dir_title, f"{index:03d}.html")
        with open(file_path, 'w', encoding="utf-8") as f:
            f.write(response.text)
Utility function for converting quantity strings that contain Unicode fraction characters (e.g. "1 ½") to decimal floats.
In [143]:
# Maps each Unicode vulgar-fraction character to its ASCII form. The leading
# space keeps the fraction separate from an adjacent whole number, so "1½"
# expands to "1 1/2" rather than "11/2".
_VULGAR_FRACTIONS = str.maketrans({
    '½': ' 1/2', '⅓': ' 1/3', '⅔': ' 2/3', '¼': ' 1/4', '¾': ' 3/4',
    '⅕': ' 1/5', '⅖': ' 2/5', '⅗': ' 3/5', '⅘': ' 4/5', '⅙': ' 1/6',
    '⅚': ' 5/6', '⅐': ' 1/7', '⅛': ' 1/8', '⅜': ' 3/8', '⅝': ' 5/8',
    '⅞': ' 7/8', '⅑': ' 1/9', '⅒': ' 1/10',
})


def unicode_to_float(input_str):
    """Convert a quantity string, possibly using Unicode fractions, to a float.

    Accepted forms include plain numbers (``"2"``, ``"0.25"``), fractions
    (``"3/4"``, ``"½"``) and mixed numbers with or without a separating
    space (``"1 ½"``, ``"1½"``). Any Unicode whitespace, including a
    non-breaking space, may separate the whole number from the fraction.

    Args:
        input_str: The quantity text.

    Returns:
        ``''`` when ``input_str`` is the empty string; otherwise a float.
        A two-token input whose tokens are not ``<int> <fraction>`` yields
        ``0.0``.

    Raises:
        ValueError: If the input is neither a single number/fraction nor a
            two-token value (for example ``"1 to 2"`` or whitespace only).
    """
    if input_str == '':
        return ''
    parts = input_str.translate(_VULGAR_FRACTIONS).split()
    if len(parts) == 2:
        try:
            return float(int(parts[0]) + Fraction(parts[1]))
        except (ValueError, ZeroDivisionError):
            # Two tokens that do not form a mixed number fall back to zero.
            return 0.0
    return float(Fraction(' '.join(parts)))
Save a CSV file containing ingredients, units, amounts.
In [131]:
def _ingredient_field(item, attribute):
    """Return the stripped text of the span flagged ``attribute='true'``, or None if absent."""
    tag = item.select_one(f"span[{attribute}='true']")
    return tag.text.strip() if tag is not None else None


def ingredient_list(directory, output_path="ingredients.csv"):
    """Extract structured ingredients from saved recipe pages into a CSV file.

    Every ``.html`` file in ``directory`` is parsed, and one row is written
    per structured-ingredient list item, with the columns
    ``ingredient_name``, ``ingredient_unit`` and ``ingredient_quantity``.

    Args:
        directory: Folder containing the downloaded ``.html`` pages. Other
            files are ignored.
        output_path: Destination CSV file. Defaults to ``ingredients.csv``
            in the current working directory.

    A field whose span is missing from an item is recorded as a
    ``"No ingredient ..."`` placeholder. Quantities are converted with
    ``unicode_to_float``; any exception it raises propagates.
    """
    columns = ["ingredient_name", "ingredient_unit", "ingredient_quantity"]
    rows = []
    # Sorted so the CSV row order is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), "html.parser")
        for item in soup.select("li.mm-recipes-structured-ingredients__list-item "):
            name = _ingredient_field(item, "data-ingredient-name")
            unit = _ingredient_field(item, "data-ingredient-unit")
            quantity = _ingredient_field(item, "data-ingredient-quantity")
            rows.append([
                name if name is not None else "No ingredient name",
                unit if unit is not None else "No ingredient unit",
                unicode_to_float(quantity) if quantity is not None else "No ingredient quantity",
            ])
    # Build the frame once; appending row by row with df.loc is quadratic.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_path, index=False)
The "main" function. Combines all above functions.
In [108]:
def overarching(term):
    """Run the whole scraping pipeline for one search term.

    Collects recipe links for ``term``, downloads each page into the folder
    ``tmp_<term>``, then parses the downloaded pages into an ingredients CSV.
    A progress message is printed after each stage.

    Args:
        term: Search term; also used verbatim in the download folder name.
    """
    # Named recipe_dir rather than dir so the builtin dir() is not shadowed.
    recipe_dir = f"tmp_{term}"
    links = get_recipe_list(term)
    print("Recipe list retrieved")
    download_recipes(links, recipe_dir)
    print("Recipes downloaded")
    ingredient_list(recipe_dir)
    print("Ingredients list created")
In [120]:
# Run the full pipeline for "ramen": pages are downloaded into tmp_ramen/ and
# the parsed ingredients are written to ingredients.csv in the working directory.
overarching("ramen")