Clustering Recipes: Ingredients

April 5, 2019
food
data

Seeing what sort of information we can glean from clustering recipes by ingredient, using 1129 recipes from the Good Food website. Three clusters seem to have emerged: desserts (yellow), spicier foods (red), and ‘all the rest’ (blue).

Python Source

from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

import json
import os
import re
import numpy as np
from itertools import chain

# A set of the most common 10,000 words, read one word per line.
# It is only ever used for membership tests when filtering ingredient words,
# so it is stored as a real set (constant-time lookups, duplicates collapsed)
# rather than a list. The encoding is given explicitly so the result does not
# depend on the platform's locale default.
with open("./common-words.txt", 'r', encoding="utf-8") as infile:
    common_words = {line.strip() for line in infile}

# Load up the data: every regular file in ./data is parsed as one recipe JSON
# document with an 'ingredients' list and a 'title' string.

path = "./data"
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `titles` / `ingredient_sets` (and therefore of the plotted points) stable
# from one run to the next.
files = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

ingredient_sets = []
titles = []
for filename in files:
    # JSON text is UTF-8; be explicit instead of relying on the locale default.
    with open(os.path.join(path, filename), 'r', encoding="utf-8") as infile:
        data = json.load(infile)
    title = data['title']
    # Add in the title words, as they tend to mention ingredients. A new list
    # is built so the parsed 'ingredients' list is not modified in place.
    ingredient_sets.append(data['ingredients'] + [title])
    titles.append(title)
        
# Turn each recipe's raw ingredient strings into one space-separated string of
# distinctive words, ready for TF-IDF.

# Patterns are compiled once and written as raw strings, so "\s" and "\b"
# reach the regex engine verbatim instead of depending on Python passing an
# unrecognised string escape through (which newer interpreters warn about).
_NON_LETTERS_RE = re.compile(r'[^a-zA-Z\s]')
_UNITS_RE = re.compile(r'\b(x|can(s?)|frac|tbsp(s?)|tsp(s?)|g|l|ml|ozg|goz|oz|fl|kg|c|cup(s?)|tablespoon(s?)|teaspoon(s?)|handful(s?))\b')
_WHITESPACE_RE = re.compile(r'\s+')

# Membership is tested once per word below; a set makes that constant-time
# whether `common_words` was loaded as a list or as a set.
_common_lookup = set(common_words)

# First take out anything that's not a-z or whitespace, then trim and lower the string
ingredient_sets = [
    [_NON_LETTERS_RE.sub("", x).strip().lower() for x in ingredient_set]
    for ingredient_set in ingredient_sets
]

# Try to prune ingredient lists down to actual ingredients

# Remove units (whole-word matches only, thanks to the \b anchors)
ingredient_sets = [
    [_UNITS_RE.sub("", x).strip() for x in ingredient_set]
    for ingredient_set in ingredient_sets
]
# Remove blanks left behind when a string consisted only of digits/units
ingredient_sets = [[x for x in ingredient_set if x] for ingredient_set in ingredient_sets]

# Split into words

per_ingredient_words = [
    [_WHITESPACE_RE.split(x) for x in ingredient_set]
    for ingredient_set in ingredient_sets
]

# Remove common words and likely adverbs/adjectives ("-ly", "-ed"), and
# combine each recipe's words into one de-duplicated set
all_ingredient_words = [
    {
        word
        for word in chain.from_iterable(ingredient_words)
        if word not in _common_lookup and not word.endswith(('ly', 'ed'))
    }
    for ingredient_words in per_ingredient_words
]

# Rejoin back into space-separated strings. Sets iterate in no particular
# order, so the words are sorted to make each string identical between runs.
all_ingredient_words = [" ".join(sorted(words)) for words in all_ingredient_words]

# Work out how important words are relative to other words in the collection
tfidf = TfidfVectorizer(stop_words="english")
X = tfidf.fit_transform(all_ingredient_words)

# Reduce the dimensionality, as we have a very sparse matrix.
# fit_transform() fits the decomposition and projects X in one call, so a
# separate fit() beforehand would only repeat the work and discard the result.
svd = TruncatedSVD(n_components=3)
Y = svd.fit_transform(X)

# Cluster the reduced vectors into three groups
model = AgglomerativeClustering(n_clusters=3)
_ = model.fit(Y)

# Project the reduced vectors down to two dimensions with t-SNE so they can be
# drawn on a flat scatter plot, then pull out the plotting columns.
tsne = TSNE(n_components=2, perplexity=30, n_iter=300)
pos = tsne.fit_transform(Y)
xs = pos[:, 0]
ys = pos[:, 1]
# One cluster label per recipe, as plain Python values
clusters = model.labels_.tolist()

from bokeh.plotting import figure, output_notebook, output_file, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.models.tools import BoxZoomTool, ZoomInTool, ZoomOutTool, ResetTool
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap
import pandas as pd

# Draw the 2-D embedding as an interactive scatter plot inside the notebook:
# one circle per recipe, coloured by cluster, with the recipe title and its
# ingredient words shown on hover.
output_notebook()

source = ColumnDataSource({'x': xs, 'y': ys, 'label': clusters, 'title': titles, 'words': all_ingredient_words})

# Colour by cluster label. The colour range is taken from the labels actually
# plotted (`clusters`); no DataFrame named `df` exists in this script, so
# reading the range from `df.label` raised a NameError.
mapper = linear_cmap(field_name='label', palette=Spectral6, low=min(clusters), high=max(clusters))

hover = HoverTool(tooltips=[
    ("(x,y)", "(@x, @y)"),
    ('title', '@title'),
    ('words', '@words'),
])

# NOTE(review): `plot_width` is the keyword this script was written against;
# newer Bokeh releases appear to expect `width` instead - confirm against the
# installed version before upgrading.
p = figure(title="Recipe Ingredient Similarity", x_axis_label='x', y_axis_label='y', tools=[hover], plot_width=1000)

# Zoom/reset controls in addition to the hover tool passed to figure()
p.add_tools(BoxZoomTool(), ZoomInTool(), ZoomOutTool(), ResetTool())

p.circle('x', 'y', source=source, color=mapper, size=12)
show(p)