What's Cooking

题目简介

链接: https://www.kaggle.com/c/whats-cooking/

描述: 通过菜肴的食材、调料以及烹饪手法等区分菜肴所属的菜系

样例: greek菜系

{
  "id": 10259,
  "cuisine": "greek",
  "ingredients": [
    "romaine lettuce",
    "black olives",
    "grape tomatoes",
    "garlic",
    "pepper",
    "purple onion",
    "seasoning",
    "garbanzo beans",
    "feta cheese crumbles"
  ]
}

参考解法

思路: $\left|\mathcal{Y}\right|$维感知机

要点: 减轻过拟合(代码中使用 L2 权重衰减 `alpha`,并配合学习率逐轮衰减 `r_decs`)

训练集准确度: 83%

测试集准确度: 78%

代码:

## language python 3
import numpy as np
import json
import csv

# Hyperparameters for the training loop further down.
# Initial step size; it is multiplied by r_decs after every training pass.
eta = 1
# Weight-decay coefficient: every update also subtracts alpha * eta * W[j].
alpha = 0.00005
# Number of passes over the training data.
n_iter = 30
# Per-pass decay factor applied to eta.
r_decs = 0.88
# Hold-out threshold ("split"): in each pass a sample whose uniform random
# draw is below this value is skipped for training and used for the
# accuracy report instead.
r_slipt = 0.05

# Vocabularies gathered while scanning the training file:
# every distinct cuisine label and every distinct ingredient string.
y_set = set()
x_set = set()

print('reading...')

with open("train.json", encoding='utf-8') as json_file:
    train_D = json.load(json_file)

N_train = len(train_D)
for recipe in train_D:
    y_set.add(recipe['cuisine'])
    # update() adds each ingredient of the recipe in one call.
    x_set.update(recipe['ingredients'])

# Number of classes and number of input features.
y_size = len(y_set)
x_size = len(x_set)

print('N_train:', N_train)
print('y_size:', y_size)
print('x_size:', x_size)

# Freeze an arbitrary but consistent column order for cuisines and
# ingredients; the same lists are used later to decode predictions.
y_list = list(y_set)
x_list = list(x_set)

# Name -> column lookups. Using list.index() here would rescan the whole
# vocabulary for every ingredient of every recipe.
y_index = {cuisine: col for col, cuisine in enumerate(y_list)}
x_index = {ingredient: col for col, ingredient in enumerate(x_list)}

# Y: one-hot cuisine target per recipe; X: multi-hot ingredient features.
Y = np.zeros([N_train, y_size])
X = np.zeros([N_train, x_size])

print('initializing...')

for i, datai in enumerate(train_D):
    Y[i, y_index[datai['cuisine']]] = 1
    for item_j in datai['ingredients']:
        X[i, x_index[item_j]] = 1

# One weight row per cuisine, one column per ingredient, each entry drawn
# uniformly from [-0.5, 0.5). Drawing the whole matrix in a single call
# avoids assigning a size-1 array into a scalar slot element by element.
W = np.random.rand(y_size, x_size) - 0.5

print('training...')

# Per-class sigmoid outputs for the sample currently being processed.
y = np.zeros(y_size)

# Each class has its own sigmoid unit. Every pass draws a fresh random
# split, takes one gradient step (with weight decay) per training sample,
# then reports accuracy on the samples that were held out in that pass.
for epoch in range(n_iter):
    # Uniform draws in [0, 1): a sample is held out when its draw < r_slipt.
    label = np.random.rand(N_train)
    held_out = label < r_slipt

    print('traing:', epoch + 1, ' time')
    errors = 0
    count = 0

    for i in np.flatnonzero(~held_out):
        xi = X[i]
        # All class activations at once instead of one dot product per class.
        y = 1.0 / (1.0 + np.exp(-(W @ xi)))
        # Row j moves by eta * (target_j - output_j) * xi, minus the
        # weight-decay term alpha * eta * W[j].
        W += eta * np.outer(Y[i] - y, xi) - alpha * eta * W

    for i in np.flatnonzero(held_out):
        y = 1.0 / (1.0 + np.exp(-(W @ X[i])))
        errors += int(Y[i].argmax() != y.argmax())
        count += 1

    # Shrink the step size for the next pass.
    eta = eta * r_decs

    if count:
        print('accuracy:', count - errors, '(', (1 - errors / count) * 100, '%) samples')
    else:
        # The random split can leave nothing held out (e.g. tiny data sets);
        # avoid dividing by zero in that case.
        print('accuracy: no held-out samples in this pass')

print('testing...')

with open('test.json', encoding='utf-8') as json_file:
    test_D = json.load(json_file)
N_test = len(test_D)

# Ingredient -> feature column. A dict makes each lookup constant time,
# where `in x_list` followed by `x_list.index` scans the list twice.
x_index = {ingredient: col for col, ingredient in enumerate(x_list)}

with open('prediction.csv', 'w', newline='') as csv_file:
    fieldnames = ['id', 'cuisine']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    for datai in test_D:
        # Encode one recipe at a time; a full N_test x x_size matrix is
        # never needed because each row is used once and then discarded.
        x_test = np.zeros(x_size)
        for itemj in datai['ingredients']:
            col = x_index.get(itemj)
            # Ingredients never seen in training have no column; skip them.
            if col is not None:
                x_test[col] = 1

        # Raw linear scores are enough here: the sigmoid is monotonic, so
        # it would not change which class has the largest value.
        y = W @ x_test
        cuisine = y_list[y.argmax()]
        writer.writerow({'id': datai['id'], 'cuisine': cuisine})

print('Done')