Amazon Alexa Reviews
Amazon Alexa, also known simply as Alexa, is a virtual assistant AI technology developed by Amazon, first used in the Amazon Echo smart speakers. It lets you instantly connect to play music, control your smart home, and get information, news, weather, and more.
Check out my prediction at https://www.kaggle.com/rakshmithamadhevan/amazon-alexa-spacy
Now come on and let's check its reviews!
# --- Imports and setup ------------------------------------------------------
import spacy
import pandas as pd   # was missing: `pd` is used throughout this script
import numpy as np    # was missing: `np` is used for the color map below
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

# for advanced visualizations
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff

init_notebook_mode(connected = True)

# Loading the English module.
# NOTE(review): the 'en' shortcut was removed in spaCy v3 — there you would
# load 'en_core_web_sm' instead.
nlp = spacy.load('en')

# Reading the TSV file; quoting=3 (QUOTE_NONE) keeps embedded quotes verbatim.
data = pd.read_csv('../input/amazon-alexa/amazon_alexa.tsv',
                   delimiter = '\t', quoting = 3)
data["variation"].head()
Word level attributes :
# Inspect token-level (word-level) attributes for one sample review.
txt = data["verified_reviews"][1009]
txt
data.describe()

doc = nlp(txt)
token_rows = [
    [token.text,       # tokenized word
     token.idx,        # character offset of the token in the text
     token.lemma_,     # lemma (no separate lemmatization step needed)
     token.is_punct,
     token.is_space,
     token.shape_,     # orthographic shape, e.g. Xxxxx
     token.pos_,
     token.tag_]
    for token in doc
]
odf = pd.DataFrame(token_rows)
odf.columns = ["Text", "StartIndex", "Lemma", "IsPunctuation", "IsSpace",
               "WordShape", "PartOfSpeech", "POSTag"]
odf
Just calling the function “nlp” on the text column gets us a lot of information. The details are as follows:
- Text — Tokenized word
- StartIndex — Index at which the word starts in the sentence
- Lemma — Lemma of the word (we need not do lemmatization separately)
- IsPunctuation — Whether the given word is a punctuation or not
- IsSpace — Whether the given word is just a white space or not
- WordShape — Gives information about the shape of word (If all letters are in upper case, we will get XXXXX, if all in lower case then xxxxx, if the first letter is upper and others lower then Xxxxx and so on)
- PartOfSpeech — Part of speech of the word
- POSTag — Tag for part of speech of word
Named Entity Recognition:
A named entity is a “real-world object” that’s assigned a name — for example, a person, a country, a product or a book title.
We also get named entity recognition as part of spacy package. It is inbuilt in the english language model and we can also train our own entities if needed.
# Tabulate the named entities spaCy detects in the sample review.
doc = nlp(txt)
entity_rows = [[ent.text, ent.label_] for ent in doc.ents]
odf = pd.DataFrame(entity_rows)
odf.columns = ["Text", "EntityType"]
odf
from spacy import displacy


def explain_text_entities(text):
    """Print each entity in *text* with its label and the label's meaning."""
    for ent in nlp(text).ents:
        print(f'Entity: {ent}, Label: {ent.label_}, {spacy.explain(ent.label_)}')


# Render inline entity highlights for a sample of reviews (rows 15..49).
for row in range(15, 50):
    review_text = data['verified_reviews'][row]
    displacy.render(nlp(review_text), style='ent', jupyter=True)
Dependency Parser
A dependency parser analyzes the grammatical structure of a sentence, establishing relationships between “head” words and words which modify those heads
Spacy can be used to create these dependency parsers which can be used in a variety of tasks
# Tabulate dependency-parse attributes for the same sample review.
doc = nlp(data["verified_reviews"][1009])
parse_rows = [
    [token.text,                             # original token text
     token.dep_,                             # relation from child to head
     token.head.text,                        # head token's text
     token.head.pos_,                        # head token's part of speech
     [child for child in token.children]]    # immediate dependents
    for token in doc
]
odf = pd.DataFrame(parse_rows)
odf.columns = ["Text", "Dep", "Head text", "Head POS", "Children"]
odf
- Text: The original token text.
- Dep: The syntactic relation connecting child to head.
- Head text: The original text of the token head.
- Head POS: The part-of-speech tag of the token head.
- Children: The immediate syntactic dependents of the token.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})
Word Similarity:
Spacy ships with a word-vector model as well, so we can use it to find similar words.
Now we can use the cosine similarity to find the words that are similar to the word “Love”
from scipy import spatial


def cosine_similarity(x, y):
    """Return the cosine similarity between vectors x and y (1 - cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


# Rank the whole vocabulary by similarity to the word "love".
love_vector = nlp.vocab['love'].vector
computed_similarities = []
for word in nlp.vocab:
    # Ignore words without vectors
    if not word.has_vector:
        continue
    similarity = cosine_similarity(love_vector, word.vector)
    computed_similarities.append((word, similarity))
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
print([w[0].text for w in computed_similarities[:10]])

# Pairwise similarity scores against "love".
# (was misleadingly named `queen`; unused `king`/`kids` lookups removed)
love = nlp.vocab['love']
happy = nlp.vocab['happy']
fun = nlp.vocab['fun']
print("Word similarity score between love and happy : ", love.similarity(happy))
print("Word similarity score between love and fun : ", love.similarity(fun))
Data Visualization
# Pie chart: how the star ratings are distributed.
ratings = data['rating'].value_counts()
rating_colors = ['pink', 'lightblue', 'aqua', 'gold', 'crimson']
rating_piechart = go.Pie(
    labels = ratings.index,
    values = ratings.values,
    marker = dict(colors = rating_colors),
    name = 'Alexa',
    hole = 0.3,
)
fig = go.Figure(
    data = [rating_piechart],
    layout = go.Layout(title = 'Distribution of Ratings for Alexa'),
)
py.iplot(fig)
# Pie chart: how the values of the binary `feedback` column are distributed.
feedbacks = data['feedback'].value_counts()
feedback_colors = ['yellow', 'lightgreen']
feedback_piechart = go.Pie(
    labels = feedbacks.index,
    values = feedbacks.values,
    marker = dict(colors = feedback_colors),
    name = 'Alexa',
    hole = 0.3,
)
fig = go.Figure(
    data = [feedback_piechart],
    layout = go.Layout(title = 'Distribution of Feedbacks for Alexa'),
)
py.iplot(fig)
# Bag-of-words counts over every review, then a top-20 frequency bar chart.
# (The import and the `cv = ...` assignment were fused onto one line — a
# syntax error in a plain script; split them apart.)
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np  # `np` is used below but was never imported in this script

cv = CountVectorizer(stop_words = 'english')
words = cv.fit_transform(data.verified_reviews)

# Total count of each vocabulary word across the whole corpus.
sum_words = words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])

plt.style.use('fivethirtyeight')
color = plt.cm.ocean(np.linspace(0, 1, 20))
frequency.head(20).plot(x='word', y='freq', kind='bar', figsize=(15, 6), color = color)
plt.title("Most Frequently Occuring Words - Top 20")
plt.show()
from wordcloud import WordCloud

# Word cloud rendered from the same corpus-wide word frequencies.
cloud = WordCloud(background_color = 'lightcyan',
                  width = 2000,
                  height = 2000).generate_from_frequencies(dict(words_freq))
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.imshow(cloud)
plt.show()