import json
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
# Set a seed for consistent results (NOTE: no seed is actually set anywhere in this script)
###############################################################################
# Load Data into pandas and Preprocess Features
###############################################################################
# Both files are read the same way: tab-separated, '?' treated as a missing
# value, and the second column (position 1) used as the row index.
_read_opts = dict(sep="\t", na_values=['?'], index_col=1)
X = pd.read_csv('data/train.tsv', **_read_opts)
X_test = pd.read_csv('data/test.tsv', **_read_opts)

# Split the target column off the training frame; X keeps only the features.
y = X.pop('label')

# Stack the test rows on top of the training rows so that preprocessing is
# applied once to everything. Test rows come first, then training rows.
X_all = pd.concat([X_test, X])
# Parse each row's JSON-encoded 'boilerplate' string into a Python object.
X_all['boilerplate'] = X_all['boilerplate'].apply(json.loads)


def _body_text(boilerplate):
    """Return the 'body' text of one parsed boilerplate record.

    Falls back to the placeholder u'empty' when the record has no 'body'
    key, when the stored body is null/NaN, or when the parsed JSON is not a
    dict at all (e.g. a JSON ``null``).
    """
    if not isinstance(boilerplate, dict):
        return u'empty'
    body = boilerplate.get('body')
    # A missing key yields None here, which pd.isnull also reports as null,
    # so "no key" and "null value" share the same fallback.
    return u'empty' if pd.isnull(body) else body


# Build the 'body' column in a single pass. The previous row-by-row loop
# assigned through chained indexing (X_all['body'][row] = ...), which is not
# guaranteed to write through to X_all (and does not under pandas
# Copy-on-Write), and its label-based lookups misbehave if the index holds
# duplicate labels.
X_all['body'] = X_all['boilerplate'].apply(_body_text)
# Learn a vocabulary over every body text (test and train together) and
# convert each document into a row of token counts.
body_counter = CountVectorizer()
body_counts = body_counter.fit_transform(X_all['body'])

# Re-separate the test and training rows: the test rows were concatenated
# first, so they occupy the leading n_test rows of the count matrix.
n_test = X_test.shape[0]
bodies_test = body_counts[:n_test]
bodies = body_counts[n_test:]
# Train a Bernoulli naive Bayes classifier on the training rows' counts.
model = BernoulliNB().fit(bodies, y)

# Keep only the second probability column (index 1), i.e. the probability of
# model.classes_[1] -- presumably the positive label; confirm against the data.
preds = model.predict_proba(bodies_test)[:, 1]

# Write one 'label' score per test row, keyed by the test frame's index.
pred_df = pd.DataFrame({'label': preds}, index=X_test.index)
pred_df.to_csv('result21.csv')
'etc' 카테고리의 다른 글
영상인식 논문 검색 (0) | 2013.10.05 |
---|---|
powered by/ designed by (0) | 2013.09.24 |
sungmin (0) | 2013.09.15 |
command (0) | 2013.09.06 |
Data viewer (0) | 2013.09.01 |