Clustering Recipes: Ingredients


This notebook explores what sort of information we can glean from clustering recipes by ingredient, using 1,129 recipes from the Good Food website. Three clusters seem to have emerged: desserts (yellow), spicier foods (red), and ‘all the rest’ (blue).

Python Source

from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

import json
import os
import re
import numpy as np
from itertools import chain
# Read the list of the most common 10,000 words, one word per line.
# Surrounding whitespace (including the trailing newline) is stripped from
# each entry; the result is kept as a list in file order.
with open("./common-words.txt", 'r') as infile:
    content = infile.readlines()

common_words = list(map(str.strip, content))
# Load every recipe file found directly inside ./data (sub-directories are
# skipped) and collect, per recipe, its ingredient strings and its title.

path = "./data"
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

ingredient_sets = []  # one list of strings per recipe: ingredients + title
titles = []           # recipe titles, index-aligned with ingredient_sets
for file in files:
    # Decode explicitly rather than relying on the platform default, which
    # can mis-read non-ASCII ingredient names on non-UTF-8 locales.
    # NOTE(review): presumes the recipe files are UTF-8 (the JSON
    # interchange default) - confirm against how they were written.
    with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    # The file only needs to stay open while it is being parsed.
    # Add in the title as well, as titles tend to mention ingredients.
    words = data['ingredients'] + [data['title']]
    ingredient_sets.append(words)
    titles.append(data['title'])
        
# Turn each recipe's raw ingredient/title strings into one space-separated
# string of candidate ingredient words, ready for vectorising.

# Raw string literals keep \s and \b as regex escapes instead of (invalid or
# unintended) Python string escapes. Patterns are compiled once up front
# because each one is applied to every string of every recipe.
_non_letters = re.compile(r'[^a-zA-Z\s]')
_units = re.compile(r'\b(x|can(s?)|frac|tbsp(s?)|tsp(s?)|g|l|ml|ozg|goz|oz|fl|kg|c|cup(s?)|tablespoon(s?)|teaspoon(s?)|handful(s?))\b')
_whitespace = re.compile(r'\s+')

# First take out anything that is not a letter or whitespace, then trim and
# lower-case the string.
ingredient_sets = [
    [_non_letters.sub("", x).strip().lower() for x in ingredient_set]
    for ingredient_set in ingredient_sets
]

# Try to prune ingredient lists down to actual ingredients.

# Remove units and quantity markers (matched as whole words only).
ingredient_sets = [
    [_units.sub("", x).strip() for x in ingredient_set]
    for ingredient_set in ingredient_sets
]
# Remove strings that ended up empty.
ingredient_sets = [[x for x in ingredient_set if x] for ingredient_set in ingredient_sets]

# Split each remaining string into words.
per_ingredient_words = [
    [_whitespace.split(x) for x in ingredient_set]
    for ingredient_set in ingredient_sets
]

# Build the lookup set once: testing membership against the word list itself
# would scan the whole list for every single word.
_common_lookup = set(common_words)

# Per recipe, merge all words into one de-duplicated collection, dropping
# common words and anything ending in 'ly' or 'ed' (a rough heuristic for
# adverbs/adjectives such as "finely" or "chopped").
all_ingredient_words = [
    {
        word
        for word in chain.from_iterable(ingredient_words)
        if word not in _common_lookup and not word.endswith(('ly', 'ed'))
    }
    for ingredient_words in per_ingredient_words
]

# Rejoin back into space-separated strings. Sorting makes the text identical
# from run to run (set iteration order for strings is not stable between
# interpreter sessions).
all_ingredient_words = [" ".join(sorted(x)) for x in all_ingredient_words]
# Work out how important words are relative to other words in the collection
tfidf = TfidfVectorizer(stop_words="english")
X = tfidf.fit_transform(all_ingredient_words)

# Reduce the dimensionality, as we have a very sparse matrix.
# fit_transform() both fits the decomposition and projects X, so a separate
# fit() beforehand only repeats the same work and discards the result.
svd = TruncatedSVD(n_components=3)
Y = svd.fit_transform(X)
# Kept for compatibility with any later code that refers to `svd_fit`;
# scikit-learn's fit() returns the estimator itself, so this is the same object.
svd_fit = svd

# Cluster the reduced vectors into three groups.
model = AgglomerativeClustering(n_clusters=3)
_ = model.fit(Y)

# We need to shift it into something two-dimensional that we can visualize -
# t-SNE is a good approach here!
# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
# `max_iter` - confirm against the installed version.
pos = TSNE(n_components=2, perplexity=30, n_iter=300).fit_transform(Y)
xs, ys = pos[:, 0], pos[:, 1]

# One cluster label per recipe, in the same order as the input documents.
clusters = model.labels_.tolist()
from bokeh.plotting import figure, output_notebook, output_file, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.models.tools import BoxZoomTool, ZoomInTool, ZoomOutTool, ResetTool
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap
import pandas as pd

# Render the 2-D embedding as an interactive scatter plot inside the notebook.
output_notebook()

# Colour each point by its cluster label. The colour range is taken from
# `clusters`, the same list that feeds the 'label' column below (the original
# referenced an undefined `df`, which raised NameError).
mapper = linear_cmap(field_name='label', palette=Spectral6, low=min(clusters), high=max(clusters))
source = ColumnDataSource({'x':xs,'y':ys,'label':clusters,'title':titles,'words':all_ingredient_words})

# Hovering over a point shows its position, the recipe title and the words
# that were kept for that recipe.
hover = HoverTool(tooltips=[
    ("(x,y)", "(@x, @y)"),
    ('title', '@title'),
    ('words', '@words'),
])

# NOTE(review): `plot_width` and `p.circle(..., size=...)` are spelled
# differently in newer Bokeh releases - confirm against the installed version.
p = figure(title="Recipe Ingredient Similarity", x_axis_label='x', y_axis_label='y', tools=[hover],plot_width=1000)

# Passing an explicit tool list above replaces the default toolbar, so the
# zoom/reset tools are added back by hand.
p.add_tools(BoxZoomTool())
p.add_tools(ZoomInTool())
p.add_tools(ZoomOutTool())
p.add_tools(ResetTool())

p.circle('x','y',source=source, color=mapper, size=12)
show(p)