import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import shutil
from scipy.sparse import csr_matrix
from scipy.sparse import csr_array
Text Data Cleaning
This section was completed using Python.
Twitter Sentiment Analysis Data
We will now clean the Sentiment Analysis Data.
# Reading in the data
# Load the raw tweets and keep only the tweet text and its sentiment label.
df = pd.read_csv('../../../data/00-raw-data/Tweets.csv')
df_subset = df[['text', 'sentiment']]
# Drop rows with a missing text or label, then renumber the rows 0..n-1 so
# that each remaining row can be addressed by its position.
df_subset = df_subset.dropna()
df_subset = df_subset.reset_index(drop=True)
print(df_subset.shape)
print(df_subset.columns)
(27480, 2)
Index(['text', 'sentiment'], dtype='object')
Now, we will clean up our tweets by removing unnecessary characters and normalizing capitalization.
# Clean every tweet in df_subset and encode its sentiment label as an integer.
tweets = []  # cleaned tweet strings, one per row of df_subset
y = []       # integer sentiment tags, appended in the same row order

keep = "abcdefghijklmnopqrstuvwxyz "  # characters copied through unchanged
replace = ".,!;"                      # punctuation replaced by a single space

# CONVERT STRINGS TO INT TAGS
# A label outside these three appends nothing to y, which would leave y
# shorter than tweets.
sentiment_tags = {"positive": 1, "negative": 0, "neutral": 2}

# ITERATE OVER ROWS
for i in range(len(df_subset)):
    # QUICKLY CLEAN TEXT
    raw = df_subset["text"][i].replace("<br />", "")
    pieces = []
    for char in raw.lower():
        if char in replace:
            pieces.append(" ")
        if char in keep:
            pieces.append(char)
    # Any character in neither set (digits, symbols, non-ASCII) is dropped.
    # Collapse runs of whitespace and trim both ends.
    tmp = " ".join("".join(pieces).split())
    tweets.append(tmp)

    sentiment = df_subset["sentiment"][i]
    if sentiment in sentiment_tags:
        y.append(sentiment_tags[sentiment])

    # PRINT FIRST COUPLE TWEETS
    if i < 3:
        print(i)
        print(raw, '\n')
        print(tmp)
0
I`d have responded, if I were going
id have responded if i were going
1
Sooo SAD I will miss you here in San Diego!!!
sooo sad i will miss you here in san diego
2
my boss is bullying me...
my boss is bullying me
# DOUBLE CHECK SIZE
# Convert the label list to a NumPy array and confirm there is one label per
# cleaned tweet.
y = np.array(y)
print(len(tweets), len(y))
27480 27480
Now, we will set up a CountVectorizer so we can further format our tweets for text classification later.
# I have to do 1000 features, nothing too much more, otherwise my kernel crashes
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the 1000 most frequent terms, with English stop
# words removed.
vectorizer = CountVectorizer(max_features=1000, stop_words="english")
# Learn the vocabulary from the cleaned tweets and build the sparse
# document-term count matrix in one step.
Xs = vectorizer.fit_transform(tweets)
Let’s format our data!
# CONVERT TO ONE-HOT VECTORS (can also be done with binary=True in CountVectorizer)
X = np.array(Xs.todense())  # dense (documents x terms) count matrix
maxs = np.max(X, axis=0)    # largest count observed for each term
# Dividing by the per-term maximum maps counts into [0, 1]; ceil then turns
# every non-zero entry into 1.0, i.e. a presence/absence indicator.
x = np.ceil(X / maxs)
vocab0 = vectorizer.vocabulary_  # term -> column index
print(x.shape, y.shape)
(27480, 1000) (27480,)
# Swap keys and values (value --> key): vocab1 maps column index -> term
vocab1 = {value: key for key, value in vocab0.items()}
# CHECK VOCAB KEY-VALUE PAIRS
print(list(vocab1.keys())[0:10])
print(list(vocab1.values())[0:10])
[420, 329, 772, 706, 552, 432, 474, 81, 409, 60]
['id', 'going', 'sooo', 'sad', 'miss', 'interview', 'leave', 'bought', 'httpwww', 'best']
# RE-ORDER COLUMN SO IT IS SORTED FROM HIGH FREQ TERMS TO LOW
df2 = pd.DataFrame(x)
# Column sums of the indicator matrix = number of tweets containing each term.
s = df2.sum(axis=0)
# Select the columns in descending order of that document frequency; the
# column labels are still the original vectorizer indices at this point.
df2 = df2[s.sort_values(ascending=False).index]
print(df2.head())
424 449 189 332 482 210 856 509 961 329 ... 405 555 532 427 \
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
657 187 749 87 166 893
0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0
[5 rows x 1000 columns]
# RENAME COLUMNS 0,1,2,3 ..
# Relabel the columns by their current position; the previous column labels
# are discarded by this assignment.
df2.columns = range(df2.columns.size)
print(df2.head())
print(df2.sum(axis=0))
# Final feature matrix as a NumPy array, columns in df2's current order.
x = df2.to_numpy()
0 1 2 3 4 5 6 7 8 9 ... 990 991 992 993 \
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
994 995 996 997 998 999
0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0
[5 rows x 1000 columns]
0 2909.0
1 2214.0
2 1964.0
3 1505.0
4 1295.0
...
995 24.0
996 24.0
997 24.0
998 23.0
999 23.0
Length: 1000, dtype: float64
# REMAP DICTIONARY TO CORRESPOND TO NEW COLUMN NUMBERS
# vocab2 maps the position of each df2 column to the term that vocab1 stores
# under that column's label.
# NOTE(review): if df2.columns has already been relabelled to 0..n-1 before
# this runs, each label equals its position and vocab2 is simply a copy of
# vocab1 -- it would not follow the frequency-sorted order. Confirm the
# intended order of the two steps.
print()
vocab2 = {}
for i1, i2 in enumerate(df2.columns):
    vocab2[i1] = vocab1[int(i2)]
print(x.shape, y.shape)
(27480, 1000) (27480,)
Now we will export our cleaned data to our data folder which we can access later in the Naive Bayes text classification section.
import csv

# TWEETS
# Write the feature matrix x to CSV, one matrix row per line, no header.
csv_file_path = "../../../data/01-modified-data/tweet.csv"
# newline='' lets the csv module control line endings itself.
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write each row to the CSV file
    writer.writerows(x)
NameError: name 'x' is not defined
# SENTIMENT
# Save the label array y as text, one value per line.
csv_file_path = "../../../data/01-modified-data/sentiment.csv"
np.savetxt(csv_file_path, y, delimiter=',')
Using Twitter API to Pull in HKJC Tweets
Now we will access the Twitter API to get Twitter data directly relating to HKJC, specifically the horse Golden Sixty. We will be using the tweepy package for Python.
import os

import tweepy

# The bearer token is a credential and must not be stored in source control.
# It is read from the TWITTER_BEARER_TOKEN environment variable instead; a
# KeyError here means the variable has not been set. Any token that has ever
# been committed to this file should be treated as leaked and revoked.
bearer_token = os.environ["TWITTER_BEARER_TOKEN"]

client = tweepy.Client(bearer_token)

# Search Tweets
query = "Golden Sixty"
tweets = client.search_recent_tweets(query=query, max_results=100)

# Guard against tweets.data being None (presumably the case when the search
# matches nothing -- confirm against the tweepy docs) so the loop cannot fail.
for tweet in tweets.data or []:
    print(tweet.text)
    if len(tweet.context_annotations) > 0:
        print(tweet.context_annotations)

# One row per returned tweet.
df = pd.DataFrame(tweets.data)
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣
GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.
#HKIR |…
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣
GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.
#HKIR |…
'He's Done His Job': Hong Kong Superstar Golden Sixty, Now An 8-Year-Old, Readies For International Swansong - Horse Racing News | Paulick Report https://t.co/1NxZ35h75Q
@tantric_eden You can be my Santa! Also, you are the biggest golden hearted hoe I have had the pleasure to know. I met my first Ho sixty years ago and in that time I have known a few so I know this to be a fact! https://t.co/GdRCEdhmzb
RT @HKJC_Racing: He has over 550 Hong Kong wins and is Golden Sixty's regular rider... 🌟
Will @Vincenthocy add a first @LONGINES #IJC titl…
RT @HKJC_Racing: 🗣️ "Great training performance by John Size! Going in cold and winning the Hong Kong Mile!"
First-up after a 189-day brea…
Golden Sixty ❓️ https://t.co/4kIuFKXST6
He has over 550 Hong Kong wins and is Golden Sixty's regular rider... 🌟
Will @Vincenthocy add a first @LONGINES #IJC title to his record!?
📍 Happy Valley, 6 Dec | #HKIR | #HKracing https://t.co/pC9kR4DsMR
RT @HongKong_Racing: A fairytale finale for GOLDEN SIXTY?
@gcunning12 looks ahead to Sunday's G1 HK Mile test for #HKRacing's most decorat…
@WHR @HollieDoyle1 @christo68914587 @netkeiba @RacingPost @RacenetTweets @SkyRacingAU @Racing @JRA_WorldRacing @BloodHorse @theTDN @gallop_keiba Just wish he would have competed against golden sixty....🤦
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。
こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @SportingLife: "If their Golden oldie has one more combustible closer in his locker, then this year’s LONGINES Hong Kong Mile could be a…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @A_Evers: Unreal morning!!
Got picked up by Uber at 4:15. Tesla with the plate “OldSport”
Golden sixty comes out and just poses for me…
RT @HongKong_Racing: A fairytale finale for GOLDEN SIXTY?
@gcunning12 looks ahead to Sunday's G1 HK Mile test for #HKRacing's most decorat…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward is offered for they are gone forever.
- HIRAL
FUKRA PREACHES POSITIVITY
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
香港カップはRomantic Warrior香港馬にしてオーストラリア遠征でコックスプレートを勝ち国内ではGolden Sixtyに次ぐ圧倒的実力を持つ馬去年の香港カップは衝撃的な勝ち方で日本馬も数頭出るがこの馬を負かすのは大変だ
香港マイルはなんと言ってもGolden Sixtyのラストラン今回初めて休み明けで香港マイルに出るから不安はあるんだけどアドマイヤマーズを赤子扱いした3年前の衝撃が忘れられない有終の美を飾って欲しい
RT @HKJC_Racing: Ready for a return in Sunday’s Hong Kong Mile? 😏
Golden Sixty was buzzing this morning as he moved through his fast work……
RT @HKJC_Racing: 🗣️ "Great training performance by John Size! Going in cold and winning the Hong Kong Mile!"
First-up after a 189-day brea…
RT @SportingLife: "If their Golden oldie has one more combustible closer in his locker, then this year’s LONGINES Hong Kong Mile could be a…
🐴 Hong Kong Mile Tips 🐴
Can Golden Sixty reclaim his #HongKongMile title?
Get the latest horse racing odds for this #ShaTin showdown.
➡️ https://t.co/2zyo2c3tNp https://t.co/beeTL6L4DH
@GOLDEN_SIXTY_60 テラタクさんそれ昨日のからあげよ〜!!!!!
でもからあげは美味しい!!!!!!ビールに合う!!!!!!
RT @gcunning12: Ageless Golden Boy or vulnerable Golden Oldie? Either way, Sunday’s Hong Kong Mile is another major test for a horse who ha…
"If their Golden oldie has one more combustible closer in his locker, then this year’s LONGINES Hong Kong Mile could be another timeless classic."
🏇🇭🇰 Our man in Hong Kong @GCunning12 wonders whether Golden Sixty can win a third LONGINES Hong Kong Mile at Sha Tin this weekend:
RT @SportingLife: 📝 "Although fans of Lucky Sweynesse, Golden Sixty and Romantic Warrior won’t need much encouragement to stay loyal, the t…
RT @HKJC_Racing: Ready for a return in Sunday’s Hong Kong Mile? 😏
Golden Sixty was buzzing this morning as he moved through his fast work……
RT @Racing: 2023 Cox Plate winner, Romantic Warrior, won a barrier trial at Sha Tin this morning with J-Mac in the saddle 😍
Golden Sixty f…
RT @HutchisHonkers: 🥹Imagine the scenes!
R S Dye heaps praise on trainer Francis Lui for his handling of Golden Sixty & can't wait for the…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @gcunning12: Ageless Golden Boy or vulnerable Golden Oldie? Either way, Sunday’s Hong Kong Mile is another major test for a horse who ha…
RT @HKJC_Racing: Ready for a return in Sunday’s Hong Kong Mile? 😏
Golden Sixty was buzzing this morning as he moved through his fast work……
RT @HKJC_Racing: Start your #HKIR week right... 👇
Here's the champ, Golden Sixty! 💙🤍💛
@LONGINES | @Vincenthocy | #HKracing https://t.co/l…
RT @gcunning12: Ageless Golden Boy or vulnerable Golden Oldie? Either way, Sunday’s Hong Kong Mile is another major test for a horse who ha…
Ageless Golden Boy or vulnerable Golden Oldie? Either way, Sunday’s Hong Kong Mile is another major test for a horse who has been among the world’s elite for four years now. https://t.co/ABzfyA2I1z
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @Racing: Will Golden Sixty win at #HKIR on Sunday? 👇 https://t.co/4z90XphcAJ
RT @SportingLife: 📝 "Although fans of Lucky Sweynesse, Golden Sixty and Romantic Warrior won’t need much encouragement to stay loyal, the t…
Will Golden Sixty win at #HKIR on Sunday? 👇 https://t.co/4z90XphcAJ
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣
GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.
#HKIR |…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HongKong_Racing: GOLDEN SIXTY this morning… 🚀🚀
Bring on Sunday’s G1 Hong Kong Mile!
#HKIR | 10 Dec | #HKRacing
https://t.co/SABMWW4…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
Do you give star Singapore galloper Lim’s Kosciuszko any chance of beating Champion galloper Golden Sixty in the Hong Kong Mile on Saturday?
https://t.co/V7j1O4cNIX
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
@JioCinema Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward is offered for they are gone forever.
#AnkitaIsTheBoss #AnkitaLokhande
Sg
@T_J_Carroll @HKJC_Racing Do the big 3 win? Lucky sweynese,golden sixty and romantic warrior? 11/10,evs and 5/4
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣
GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.
#HKIR |…
RT @HKJC_Racing: 🗣️ "Great training performance by John Size! Going in cold and winning the Hong Kong Mile!"
First-up after a 189-day brea…
🗣️ "Great training performance by John Size! Going in cold and winning the Hong Kong Mile!"
First-up after a 189-day break, Glorious Days scored a remarkable win at the 2013 @LONGINES #HKIR! 🔥
Golden Sixty returns after 224 days on Sunday... 😳 https://t.co/c6XRsuugRs
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @Punters: "He's done his job."
Superstar Golden Sixty readies for probable Hong Kong International Races swansong
Story 👉 https://t.co…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @A_Evers: Unreal morning!!
Got picked up by Uber at 4:15. Tesla with the plate “OldSport”
Golden sixty comes out and just poses for me…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。
こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。
こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。
こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍
#ナミュール 🎌 | #競馬 | @LONG…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。
こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @worldsbesthorse: Golden Sixty will try to win his third Longines Hong Kong Mile (G1) on Sunday. Read: https://t.co/uT6OIegN2n https://t…
Now we want to clean this tweet data so that it matches the format of the sentiment data, which we will use to train a classifier. The cleaning largely follows the same steps used for the sentiment data.
# Clean the Golden Sixty tweets using the same character rules as the
# sentiment tweets: lowercase, keep letters and spaces, turn .,!; into spaces.
df_subset = df[['text', 'id']]
tweets_golden = []  # cleaned tweet strings, one per row of df_subset
# NOTE(review): this rebinds y to an empty list, discarding whatever y held
# before; nothing in this block appends to it. Confirm y is no longer needed.
y = []

keep = "abcdefghijklmnopqrstuvwxyz "  # characters copied through unchanged
replace = ".,!;"                      # punctuation replaced by a single space

# ITERATE OVER ROWS
for i in range(len(df_subset)):
    # QUICKLY CLEAN TEXT
    raw = df_subset["text"][i].replace("<br />", "")
    pieces = []
    for char in raw.lower():
        if char in replace:
            pieces.append(" ")
        if char in keep:
            pieces.append(char)
    # Any character in neither set (digits, emoji, non-Latin script) is
    # dropped. Collapse runs of whitespace and trim both ends.
    tmp = " ".join("".join(pieces).split())
    tweets_golden.append(tmp)
# Save the cleaned tweet text (before vectorizing) as a one-column CSV.
og_tweets = pd.DataFrame(tweets_golden)

csv_file_path = "../../../data/01-modified-data/goldensixty_uncleaned.csv"

# index=False keeps the DataFrame's row index out of the file.
og_tweets.to_csv(csv_file_path, index=False)
# Vectorize the cleaned Golden Sixty tweets and convert counts to 0/1.
# NOTE(review): fit_transform re-learns the vectorizer's vocabulary from
# tweets_golden, so these columns will not line up with features produced by
# an earlier fit on other text. If these features are meant for a classifier
# trained on that earlier vocabulary, vectorizer.transform(...) is presumably
# what is wanted -- confirm before changing.
Xs_golden = vectorizer.fit_transform(tweets_golden)
X_golden = np.array(Xs_golden.todense())  # dense (documents x terms) counts
maxs_golden = np.max(X_golden, axis=0)    # largest count seen for each term
# Scale into [0, 1] by the per-term maximum, then ceil so that every non-zero
# entry becomes 1.0.
x2 = np.ceil(X_golden / maxs_golden)
vocab0_2 = vectorizer.vocabulary_  # term -> column index
# Swap keys and values (value --> key): vocab1_2 maps column index -> term
vocab1_2 = {value: key for key, value in vocab0_2.items()}
# RE-ORDER COLUMN SO IT IS SORTED FROM HIGH FREQ TERMS TO LOW
df2_g = pd.DataFrame(x2)
# Column sums of the indicator matrix = number of tweets containing each term.
s_g = df2_g.sum(axis=0)
# Select the columns in descending order of that document frequency.
df2_g = df2_g[s_g.sort_values(ascending=False).index]
print(df2_g.head())
66 166 124 83 190 116 47 93 239 125 ... 144 123 130 131 \
0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
1 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
2 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
3 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0
132 136 140 142 143 122
0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 1.0 0.0 0.0 0.0 0.0
4 0.0 0.0 1.0 0.0 0.0 0.0
[5 rows x 240 columns]
# RENAME COLUMNS 0,1,2,3 ..
# Relabel the columns by their current position; the previous column labels
# are discarded by this assignment.
df2_g.columns = range(df2_g.columns.size)
print(df2_g.head())
print(df2_g.sum(axis=0))
# Final Golden Sixty feature matrix as a NumPy array.
x_g = df2_g.to_numpy()
0 1 2 3 4 5 6 7 8 9 ... 230 231 232 233 \
0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
1 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
2 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
3 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0
234 235 236 237 238 239
0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 1.0 0.0 0.0 0.0 0.0
4 0.0 0.0 1.0 0.0 0.0 0.0
[5 rows x 240 columns]
0 92.0
1 82.0
2 45.0
3 30.0
4 29.0
...
235 1.0
236 1.0
237 1.0
238 1.0
239 1.0
Length: 240, dtype: float64
Now we can export our data to our data folder.
# Write the Golden Sixty feature matrix x_g to CSV, one matrix row per line,
# no header. Relies on the csv module already being imported in this file.
csv_file_path = "../../../data/01-modified-data/goldensixty.csv"
# newline='' lets the csv module control line endings itself.
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write each row to the CSV file
    writer.writerows(x_g)