Text Data Cleaning

This section was completed using Python.

Twitter Sentiment Analysis Data

We will now clean the Sentiment Analysis Data.

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import re
import string
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import shutil
from scipy.sparse import csr_matrix
from scipy.sparse import csr_array

# Load the raw tweets from disk
df = pd.read_csv('../../../data/00-raw-data/Tweets.csv')

# Keep only the text and its label, drop incomplete rows, and renumber the
# rows so the index runs 0..n-1 without gaps.
df_subset = df[['text', 'sentiment']].dropna().reset_index(drop=True)

print(df_subset.shape)
print(df_subset.columns)

(27480, 2)
Index(['text', 'sentiment'], dtype='object')

Now, we will go ahead and clean our tweets up by removing unnecessary characters and normalizing capitalization.

tweets = []
y = []

# Characters that survive cleaning, and punctuation that is turned into a space
keep = "abcdefghijklmnopqrstuvwxyz "
replace = ".,!;"

# Integer tag used for each sentiment label
label_codes = {"positive": 1, "negative": 0, "neutral": 2}

# WALK THE TEXT AND SENTIMENT COLUMNS TOGETHER, ROW BY ROW
for i, (raw_text, sentiment) in enumerate(zip(df_subset["text"], df_subset["sentiment"])):
    # QUICKLY CLEAN TEXT
    stripped = raw_text.replace("<br />", "")
    pieces = []
    for char in stripped.lower():
        if char in replace:
            pieces.append(" ")
        elif char in keep:
            pieces.append(char)
    # Collapse any run of whitespace into a single space and trim the ends
    tmp = " ".join("".join(pieces).split())
    tweets.append(tmp)

    # CONVERT STRINGS TO INT TAGS (a label outside the table adds nothing)
    code = label_codes.get(sentiment)
    if code is not None:
        y.append(code)

    # PRINT FIRST COUPLE TWEETS
    if i < 3:
        print(i)
        print(stripped, '\n')
        print(tmp)

0
 I`d have responded, if I were going 

id have responded if i were going
1
 Sooo SAD I will miss you here in San Diego!!! 

sooo sad i will miss you here in san diego
2
my boss is bullying me... 

my boss is bullying me

# DOUBLE CHECK SIZE
# The labels become a NumPy array; the two counts printed should agree.
y = np.asarray(y)
print(len(tweets), y.shape[0])

27480 27480

Now, we will get a CountVectorizer up and running so we can further format our tweets for text classification later.

# Cap the vocabulary at 1000 features; going much beyond that crashed the kernel
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words="english", max_features=1000)
Xs = vectorizer.fit_transform(tweets)

Let’s format our data!

# CONVERT TO ONE-HOT VECTORS (can also be done with binary=True in CountVectorizer)
X = Xs.toarray()

# Dividing each column by its largest count and rounding up turns every
# positive count into 1.0 while zeros stay 0.0.
maxs = X.max(axis=0)
x = np.ceil(np.true_divide(X, maxs))

# Term --> column index mapping learned by the vectorizer
vocab0 = vectorizer.vocabulary_

print(x.shape, y.shape)

(27480, 1000) (27480,)

# Invert the vocabulary so it maps column index --> term (value --> key)
vocab1 = {index: term for term, index in vocab0.items()}

# CHECK VOCAB KEY-VALUE PAIRS
print(list(vocab1)[:10])
print(list(vocab1.values())[:10])

[420, 329, 772, 706, 552, 432, 474, 81, 409, 60]
['id', 'going', 'sooo', 'sad', 'miss', 'interview', 'leave', 'bought', 'httpwww', 'best']

# RE-ORDER COLUMNS SO THEY ARE SORTED FROM HIGH FREQ TERMS TO LOW
df2 = pd.DataFrame(x)

# Column totals: how many tweets contain each term
s = df2.sum()

# Select the columns in descending order of their totals
by_frequency = s.sort_values(ascending=False)
df2 = df2[by_frequency.index]
print(df2.head())

   424  449  189  332  482  210  856  509  961  329  ...  405  555  532  427  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   657  187  749  87   166  893  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 1000 columns]

# RENAME COLUMNS 0,1,2,3 ..
# The labels now give each column's position in the frequency-sorted frame.
df2.columns = range(df2.shape[1])
x = df2.to_numpy()

print(df2.head())
print(df2.sum(axis=0))

   0    1    2    3    4    5    6    7    8    9    ...  990  991  992  993  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   994  995  996  997  998  999  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 1000 columns]
0      2909.0
1      2214.0
2      1964.0
3      1505.0
4      1295.0
        ...  
995      24.0
996      24.0
997      24.0
998      23.0
999      23.0
Length: 1000, dtype: float64

# REMAP DICTIONARY TO CORRESPOND TO NEW COLUMN NUMBERS
# df2's columns have already been relabelled 0..n-1, so iterating over them
# only reproduces vocab1 unchanged. The series `s` still carries the original
# column labels; sorting it exactly as the re-ordering step did yields, for
# each new column position, the original vocabulary index of the term there.
print()
sorted_vocab_ids = s.sort_values(ascending=False).index
vocab2 = {
    new_col: vocab1[int(old_col)]
    for new_col, old_col in enumerate(sorted_vocab_ids)
}

print(x.shape, y.shape)

(27480, 1000) (27480,)

Now we will export our cleaned data to our data folder which we can access later in the Naive Bayes text classification section.

import csv

# TWEETS
csv_file_path = "../../../data/01-modified-data/tweet.csv"
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    # Emit one CSV line for every row of the one-hot matrix
    writer.writerows(x)

NameError: name 'x' is not defined

# SENTIMENT
# Write the integer label array as plain text, one value per line
csv_file_path = "../../../data/01-modified-data/sentiment.csv"
np.savetxt(fname=csv_file_path, X=y, delimiter=',')

Using Twitter API to Pull in HKJC Tweets

Now we will access the Twitter API in order to get Twitter data directly relating to HKJC, specifically the horse Golden Sixty. We will be using the tweepy package for Python.

import os

import tweepy

# The bearer token is a credential, so it is read from the environment rather
# than being written into the source file.
# NOTE(review): any token that was previously committed in this file should be
# treated as compromised and revoked.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
if not bearer_token:
    raise RuntimeError(
        "Set the TWITTER_BEARER_TOKEN environment variable before running this cell."
    )

client = tweepy.Client(bearer_token)

# Search Tweets
query = "Golden Sixty"
# NOTE: this rebinds `tweets`, which earlier held the cleaned sentiment tweets.
tweets = client.search_recent_tweets(query=query, max_results=100)

# `tweets.data` may be None when the search matches nothing; fall back to an
# empty list so the loop and the DataFrame construction still run.
found_tweets = tweets.data or []

for tweet in found_tweets:
    print(tweet.text)
    if len(tweet.context_annotations) > 0:
        print(tweet.context_annotations)

df = pd.DataFrame(found_tweets)

RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣

GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.

#HKIR |…
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣

GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.

#HKIR |…
'He's Done His Job': Hong Kong Superstar Golden Sixty, Now An 8-Year-Old, Readies For International Swansong - Horse Racing News | Paulick Report https://t.co/1NxZ35h75Q
@tantric_eden You can be my Santa! Also, you are the biggest golden hearted hoe I have had the pleasure to know. I met my first Ho sixty years ago and in that time I have known a few so I know this to be a fact! https://t.co/GdRCEdhmzb
RT @HKJC_Racing: He has over 550 Hong Kong wins and is Golden Sixty's regular rider... 🌟

Will @Vincenthocy add a first @LONGINES #IJC titl…
RT @HKJC_Racing: 🗣️ "Great training performance by John Size! Going in cold and winning the Hong Kong Mile!"

First-up after a 189-day brea…
Golden Sixty ❓️ https://t.co/4kIuFKXST6
He has over 550 Hong Kong wins and is Golden Sixty's regular rider... 🌟

Will @Vincenthocy add a first @LONGINES #IJC title to his record!? 

📍 Happy Valley, 6 Dec | #HKIR | #HKracing https://t.co/pC9kR4DsMR
RT @HongKong_Racing: A fairytale finale for GOLDEN SIXTY?

@gcunning12 looks ahead to Sunday's G1 HK Mile test for #HKRacing's most decorat…
@WHR @HollieDoyle1 @christo68914587 @netkeiba @RacingPost @RacenetTweets @SkyRacingAU @Racing @JRA_WorldRacing @BloodHorse @theTDN @gallop_keiba Just wish he would have competed against golden sixty....🤦
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。

こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @SportingLife: "If their Golden oldie has one more combustible closer in his locker, then this year’s LONGINES Hong Kong Mile could be a…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @A_Evers: Unreal morning!!

Got picked up by Uber at 4:15. Tesla with the plate “OldSport”

Golden sixty comes out and just poses for me…
RT @HongKong_Racing: A fairytale finale for GOLDEN SIXTY?

@gcunning12 looks ahead to Sunday's G1 HK Mile test for #HKRacing's most decorat…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
RT @JainMumbaikar: Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward…
Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward is offered for they are gone forever.

- HIRAL

FUKRA PREACHES POSITIVITY
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
香港カップはRomantic Warrior香港馬にしてオーストラリア遠征でコックスプレートを勝ち国内ではGolden Sixtyに次ぐ圧倒的実力を持つ馬去年の香港カップは衝撃的な勝ち方で日本馬も数頭出るがこの馬を負かすのは大変だ
香港マイルはなんと言ってもGolden Sixtyのラストラン今回初めて休み明けで香港マイルに出るから不安はあるんだけどアドマイヤマーズを赤子扱いした3年前の衝撃が忘れられない有終の美を飾って欲しい
RT @HKJC_Racing: Ready for a return in Sunday’s Hong Kong Mile? 😏

Golden Sixty was buzzing this morning as he moved through his fast work……
RT @HKJC_Racing: 🗣️ "Great training performance by John Size! Going in cold and winning the Hong Kong Mile!"

First-up after a 189-day brea…
RT @SportingLife: "If their Golden oldie has one more combustible closer in his locker, then this year’s LONGINES Hong Kong Mile could be a…
🐴 Hong Kong Mile Tips 🐴

Can Golden Sixty reclaim his #HongKongMile title?

Get the latest horse racing odds for this #ShaTin showdown. 

➡️ https://t.co/2zyo2c3tNp https://t.co/beeTL6L4DH
@GOLDEN_SIXTY_60 テラタクさんそれ昨日のからあげよ〜！！！！！
でもからあげは美味しい！！！！！！ビールに合う！！！！！！
RT @gcunning12: Ageless Golden Boy or vulnerable Golden Oldie? Either way, Sunday’s Hong Kong Mile is another major test for a horse who ha…
"If their Golden oldie has one more combustible closer in his locker, then this year’s LONGINES Hong Kong Mile could be another timeless classic."

🏇🇭🇰 Our man in Hong Kong @GCunning12 wonders whether Golden Sixty can win a third LONGINES Hong Kong Mile at Sha Tin this weekend:
RT @SportingLife: 📝 "Although fans of Lucky Sweynesse, Golden Sixty and Romantic Warrior won’t need much encouragement to stay loyal, the t…
RT @HKJC_Racing: Ready for a return in Sunday’s Hong Kong Mile? 😏

Golden Sixty was buzzing this morning as he moved through his fast work……
RT @Racing: 2023 Cox Plate winner, Romantic Warrior, won a barrier trial at Sha Tin this morning with J-Mac in the saddle 😍

Golden Sixty f…
RT @HutchisHonkers: 🥹Imagine the scenes!

R S Dye heaps praise on trainer Francis Lui for his handling of Golden Sixty &amp; can't wait for the…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @gcunning12: Ageless Golden Boy or vulnerable Golden Oldie? Either way, Sunday’s Hong Kong Mile is another major test for a horse who ha…
RT @HKJC_Racing: Ready for a return in Sunday’s Hong Kong Mile? 😏

Golden Sixty was buzzing this morning as he moved through his fast work……
RT @HKJC_Racing: Start your #HKIR week right... 👇

Here's the champ, Golden Sixty! 💙🤍💛

@LONGINES | @Vincenthocy | #HKracing https://t.co/l…
RT @gcunning12: Ageless Golden Boy or vulnerable Golden Oldie? Either way, Sunday’s Hong Kong Mile is another major test for a horse who ha…
Ageless Golden Boy or vulnerable Golden Oldie? Either way, Sunday’s Hong Kong Mile is another major test for a horse who has been among the world’s elite for four years now. https://t.co/ABzfyA2I1z
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @Racing: Will Golden Sixty win at #HKIR on Sunday? 👇 https://t.co/4z90XphcAJ
RT @SportingLife: 📝 "Although fans of Lucky Sweynesse, Golden Sixty and Romantic Warrior won’t need much encouragement to stay loyal, the t…
Will Golden Sixty win at #HKIR on Sunday? 👇 https://t.co/4z90XphcAJ
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣

GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.

#HKIR |…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HongKong_Racing: GOLDEN SIXTY this morning… 🚀🚀

Bring on Sunday’s G1 Hong Kong Mile!

#HKIR | 10 Dec | #HKRacing

 https://t.co/SABMWW4…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
Do you give star Singapore galloper Lim’s Kosciuszko any chance of beating Champion galloper Golden Sixty in the Hong Kong Mile on Saturday?

https://t.co/V7j1O4cNIX
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
@JioCinema Lost, yesterday, somewhere between sunrise and sunset, two golden hours, each set with sixty diamond minutes. No reward is offered for they are gone forever.

#AnkitaIsTheBoss #AnkitaLokhande

Sg
@T_J_Carroll @HKJC_Racing Do the big 3 win? Lucky sweynese,golden sixty and romantic warrior? 11/10,evs and 5/4
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣

GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.

#HKIR |…
RT @HKJC_Racing: 🗣️ "Great training performance by John Size! Going in cold and winning the Hong Kong Mile!"

First-up after a 189-day brea…
🗣️ "Great training performance by John Size! Going in cold and winning the Hong Kong Mile!"

First-up after a 189-day break, Glorious Days scored a remarkable win at the 2013 @LONGINES #HKIR! 🔥

Golden Sixty returns after 224 days on Sunday... 😳 https://t.co/c6XRsuugRs
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @Punters: "He's done his job."

Superstar Golden Sixty readies for probable Hong Kong International Races swansong

Story 👉 https://t.co…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @A_Evers: Unreal morning!!

Got picked up by Uber at 4:15. Tesla with the plate “OldSport”

Golden sixty comes out and just poses for me…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。

こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。

こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。

こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @Hartley_026: 3度目の香港マイル制覇を狙うGolden Sixty。K.ルイ調教師は日本馬を警戒する他、「これが恐らく最後の香港国際競走」ともコメント。

こちらは香港ジョッキークラブが寄稿している記事なんですが、「日本での引退が濃厚」とあるので余生を日本で…
RT @worldsbesthorse: Golden Sixty will try to win his third Longines Hong Kong Mile (G1) on Sunday. Read: https://t.co/uT6OIegN2n https://t…

Now we want to clean this tweet data up so that it matches the format of the sentiment data, which we will use to train a classifier. The cleaning largely follows the same steps used for the sentiment data.

df_subset = df[['text', 'id']]
tweets_golden = []
# y is reset to an empty list here; nothing in this cell appends to it
y = []

# Characters that survive cleaning, and punctuation that is turned into a space
keep = "abcdefghijklmnopqrstuvwxyz "
replace = ".,!;"

# WALK THE TEXT COLUMN ROW BY ROW
for raw_text in df_subset["text"]:
    # QUICKLY CLEAN TEXT
    pieces = []
    for char in raw_text.replace("<br />", "").lower():
        if char in replace:
            pieces.append(" ")
        elif char in keep:
            pieces.append(char)
    # Collapse any run of whitespace into a single space and trim the ends
    tmp = " ".join("".join(pieces).split())
    tweets_golden.append(tmp)

# Save the cleaned tweet strings, one per row, before they are vectorized
csv_file_path = "../../../data/01-modified-data/goldensixty_uncleaned.csv"

og_tweets = pd.DataFrame(tweets_golden)
og_tweets.to_csv(csv_file_path, index=False)

# NOTE(review): fit_transform re-learns the vocabulary from these tweets, so the
# columns produced here presumably do not line up with the matrix built from
# the sentiment data -- confirm what the downstream classifier expects, or
# consider vectorizer.transform to reuse the vocabulary already fitted.
Xs_golden = vectorizer.fit_transform(tweets_golden)

# Dense counts, then positive counts --> 1.0 via divide-by-column-max and ceil
X_golden = Xs_golden.toarray()
maxs_golden = X_golden.max(axis=0)
x2 = np.ceil(np.true_divide(X_golden, maxs_golden))
vocab0_2 = vectorizer.vocabulary_

# Invert the vocabulary so it maps column index --> term (value --> key)
vocab1_2 = {index: term for term, index in vocab0_2.items()}

# RE-ORDER COLUMNS SO THEY ARE SORTED FROM HIGH FREQ TERMS TO LOW
df2_g = pd.DataFrame(x2)

# Column totals: how many tweets contain each term
s_g = df2_g.sum()

# Select the columns in descending order of their totals
by_frequency_g = s_g.sort_values(ascending=False)
df2_g = df2_g[by_frequency_g.index]
print(df2_g.head())

   66   166  124  83   190  116  47   93   239  125  ...  144  123  130  131  \
0  1.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0  0.0  0.0   

   132  136  140  142  143  122  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  1.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  1.0  0.0  0.0  0.0  

[5 rows x 240 columns]

# RENAME COLUMNS 0,1,2,3 ..
# The labels now give each column's position in the frequency-sorted frame.
df2_g.columns = range(df2_g.shape[1])
x_g = df2_g.to_numpy()

print(df2_g.head())
print(df2_g.sum(axis=0))

   0    1    2    3    4    5    6    7    8    9    ...  230  231  232  233  \
0  1.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0  0.0  0.0   

   234  235  236  237  238  239  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  1.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  1.0  0.0  0.0  0.0  

[5 rows x 240 columns]
0      92.0
1      82.0
2      45.0
3      30.0
4      29.0
       ... 
235     1.0
236     1.0
237     1.0
238     1.0
239     1.0
Length: 240, dtype: float64

Now we can export our data to our data folder.

csv_file_path = "../../../data/01-modified-data/goldensixty.csv"
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)

    # Emit one CSV line for every row of the one-hot matrix
    writer.writerows(x_g)