!pip install bertopic gensim pyLDAvis ipykernel
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from bertopic import BERTopic
import torch
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
nltk.download("stopwords")
nltk.download("punkt_tab")
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt_tab.zip.
True
googleReview = pd.read_excel("Google_12_months.xlsx")
trustPilot = pd.read_excel("Trustpilot_12_months.xlsx")
googleReview.rename(columns= {"Club's Name": "Location Name", "Overall Score": "Review Score"}, inplace = True)
trustPilot.rename(columns= {"Review Content": "Comment", "Review Stars": "Review Score"}, inplace = True)
def data_preprocessing(data, col_name):
    data = data.copy()
    print(data.shape)
    stop_words = set(stopwords.words("english"))
    data = data.dropna(subset=[col_name])
    classifier = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection", truncation=True)
    data[col_name] = data[col_name].str.replace(r'\d+', '', regex=True)

    def process_text(text):
        lang_result = classifier(text)
        if lang_result[0]["label"] == "en":
            return " ".join([word for word in word_tokenize(text.lower()) if word not in stop_words and word.isalpha()])
        return ""

    data[col_name] = data[col_name].apply(process_text)
    return data
col_name = "Comment"
print("Google Reviews")
clean_google = data_preprocessing(googleReview, col_name)
print("Trustpilot Reviews")
clean_pilot = data_preprocessing(trustPilot, col_name)
Google Reviews
(23250, 7)
Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Trustpilot Reviews
(16673, 15)
Device set to use cuda:0
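(Note: the "use a dataset" warning above appears because process_text calls the language-detection pipeline one review at a time. The sketch below shows a batched alternative under the same checkpoint; the batch_size of 32 is an arbitrary, illustrative choice.)
# Hypothetical batched variant of the per-row language check in data_preprocessing.
lang_classifier = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection", truncation=True)
texts = googleReview["Comment"].dropna().astype(str).tolist()
lang_results = lang_classifier(texts, batch_size=32)
is_english = [r["label"] == "en" for r in lang_results]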
def wordcloud(cloud):
    custom_stopwords = {"gym", "puregym"}
    all_stopwords = STOPWORDS.union(custom_stopwords)
    for title, words in cloud.items():
        wc = WordCloud(stopwords=all_stopwords, width=800, height=400, background_color='white').generate(" ".join(words))
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"The wordcloud for: {title}")
        plt.show()
def analysis(data):
    neg_reviews = {}
    for name, df in data.items():
        all_words = [word for comment in df['Comment'] for word in word_tokenize(comment)]
        print(f"{name} has {df['Location Name'].str.lower().nunique()} unique locations")
        print(f"{name} frequency distribution is {FreqDist(all_words)}")
        print(f"{name} has this shape {df.shape}")
        common_words = FreqDist(all_words).most_common(10)
        neg_reviews[name.split()[0]] = df[df["Review Score"] < 3]
        neg_words = [word for comment in neg_reviews[name.split()[0]]['Comment'] for word in word_tokenize(comment)]
        print(f"{name} frequency distribution of negative words is {FreqDist(neg_words)}")
        print(f"{name} negative reviews has this shape {neg_reviews[name.split()[0]].shape}")
        words, counts = zip(*common_words)
        plt.figure(figsize=(10, 5))
        plt.bar(words, counts, color='skyblue')
        plt.xlabel("Words")
        plt.ylabel("Frequency")
        plt.title("Top 10 Most Common Words")
        plt.show()
        cloud = {f"All Reviews for {name.split()[0]}": all_words, f"Negative Reviews for {name.split()[0]}": neg_words}
        wordcloud(cloud)
    return neg_reviews
datasets = {"google review": clean_google, "trustpilot reviews": clean_pilot}
neg_dataset = analysis(datasets)
google review has 512 unique locations
google review frequency distribution is <FreqDist with 13492 samples and 241042 outcomes>
google review has this shape (13898, 7)
google review frequency distribution of negative words is <FreqDist with 8414 samples and 81163 outcomes>
google review negative reviews has this shape (2785, 7)
trustpilot reviews has 374 unique locations
trustpilot reviews frequency distribution is <FreqDist with 12859 samples and 277004 outcomes>
trustpilot reviews has this shape (16673, 15)
trustpilot reviews frequency distribution of negative words is <FreqDist with 8410 samples and 95444 outcomes>
trustpilot reviews negative reviews has this shape (3543, 15)
common_locations = len(set(clean_google["Location Name"].str.lower()).intersection(set(clean_pilot["Location Name"].str.lower())))
print(f'Number of common locations: {common_locations}')
Number of common locations: 312
google_neg = None
trustpilot_neg = None
for name, df in neg_dataset.items():
    if name == "google":
        google_neg = df
    else:
        trustpilot_neg = df
common_google = google_neg[google_neg["Location Name"].isin(trustpilot_neg["Location Name"])][["Comment","Location Name"]]
common_trustpilot = trustpilot_neg[trustpilot_neg["Location Name"].isin(common_google["Location Name"])][["Comment", "Location Name"]]
merged_data = pd.concat([common_google, common_trustpilot], ignore_index=True)
merged_data.shape
(3888, 2)
def bert_modelling(data):
    data = data.copy()
    merged_neg_clean = data["Comment"].tolist()
    model = BERTopic(verbose=True)
    model.fit(merged_neg_clean)
    topics, probabilities = model.transform(merged_neg_clean)
    data["topic"] = topics
    return model, data, topics, probabilities
model, topics_data, topics, probabilities = bert_modelling(merged_data)
2025-04-01 03:01:29,078 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-01 03:01:31,289 - BERTopic - Embedding - Completed ✓
2025-04-01 03:01:31,290 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-01 03:01:54,395 - BERTopic - Dimensionality - Completed ✓
2025-04-01 03:01:54,396 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-01 03:01:54,540 - BERTopic - Cluster - Completed ✓
2025-04-01 03:01:54,544 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-01 03:01:54,694 - BERTopic - Representation - Completed ✓
2025-04-01 03:01:55,924 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-01 03:01:55,936 - BERTopic - Dimensionality - Completed ✓
2025-04-01 03:01:55,937 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-01 03:01:56,069 - BERTopic - Cluster - Completed ✓
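(Note: bert_modelling fits the model and then transforms the same documents, which embeds the corpus twice, hence the two embedding passes in the log above. A minimal sketch of an equivalent single pass with fit_transform, keeping the default BERTopic settings, is shown below; assignments can differ slightly because transform re-approximates the clusters.)
def bert_modelling_single_pass(data):
    # Same idea as bert_modelling above, but fit_transform embeds and clusters the documents once.
    data = data.copy()
    docs = data["Comment"].tolist()
    model = BERTopic(verbose=True)
    topics, probabilities = model.fit_transform(docs)
    data["topic"] = topics
    return model, data, topics, probabilities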
def bert_visuals(bert_model):
    print(f"Total number of topics: {len(bert_model.get_topics())}")
    topic_freq = bert_model.get_topic_freq()
    print(f"Outliers:\n {topic_freq[topic_freq['Topic'] == -1]} \n")
    print(f"Top topics along with their document frequencies: \n {topic_freq[topic_freq['Topic'] != -1].head(10)} \n")
    top_2_topics = topic_freq[topic_freq['Topic'] != -1].head(2)["Topic"].tolist()
    for tp in top_2_topics:
        print(f"Topic {tp} has the following top words: \n {bert_model.get_topic(tp)} \n")
    fig1 = bert_model.visualize_topics()
    fig1.show()
    fig2 = bert_model.visualize_barchart()
    fig2.show()
    fig3 = bert_model.visualize_heatmap()
    fig3.show()
bert_visuals(model)
Total number of topics: 50

Outliers:
   Topic  Count
4     -1   1628

Top topics along with their document frequencies:
    Topic  Count
2       0    195
11      1    150
10      2    142
3       3    137
12      4    123
20      5    115
27      6     83
7       7     71
8       8     58
41      9     57

Topic 0 has the following top words:
[('air', 0.06535853622220478), ('conditioning', 0.039109409267406005), ('hot', 0.0341153751577972), ('aircon', 0.030904603329895447), ('con', 0.02539572030351039), ('ac', 0.02335473164547456), ('summer', 0.022840948850730125), ('heat', 0.022368245534383064), ('working', 0.018771570123393193), ('temperature', 0.017792814503912378)]

Topic 1 has the following top words:
[('rude', 0.021586991254173425), ('manager', 0.02069975537194293), ('staff', 0.01869429753354046), ('member', 0.01668603498125788), ('gym', 0.01588568912675759), ('women', 0.012400406597117608), ('personal', 0.011682768418990724), ('reviews', 0.011341051914656263), ('men', 0.010887589494607062), ('told', 0.01059952186619874)]
top_10_topic_Location = topics_data[(topics_data["topic"] > -1) & (topics_data["topic"] < 10)]
top_10_topic_Location[["Location Name", "topic"]].value_counts().head(10)
| Location Name | topic | count |
|---|---|---|
| Leicester Walnut Street | 0 | 28 |
| Nottingham Colwick | 5 | 16 |
| Billericay | 5 | 13 |
| London Stratford | 2 | 12 |
| Aylesbury | 1 | 10 |
| Walsall Crown Wharf | 5 | 10 |
| Reading Calcot | 0 | 9 |
| Norwich Riverside | 1 | 8 |
| Burnham | 2 | 8 |
| Paisley | 0 | 7 |
print(f"google top 20 locations with the most negative reviews {common_google['Location Name'].value_counts().head(20)}")
print("\n")
print(f"Trustpilot top 20 locations with the most negative reviews {common_trustpilot['Location Name'].value_counts().head(20)}")
google top 20 locations with the most negative reviews
Location Name
London Stratford            59
London Canary Wharf         26
London Enfield              25
London Swiss Cottage        24
Birmingham City Centre      21
London Leytonstone          21
New Barnet                  20
Wakefield                   19
Bradford Thornbury          19
Walsall Crown Wharf         18
London Hoxton               18
London Seven Sisters        18
London Hayes                17
Manchester Exchange Quay    17
Sutton Times Square         16
London Bermondsey           16
Nottingham Colwick          16
London Piccadilly           15
Leeds City Centre North     15
London Muswell Hill         15
Name: count, dtype: int64

Trustpilot top 20 locations with the most negative reviews
Location Name
Leicester Walnut Street      50
London Enfield               23
London Stratford             22
Burnham                      20
London Bermondsey            18
York                         16
Maidenhead                   16
London Seven Sisters         16
London Finchley              16
London Hayes                 16
London Swiss Cottage         15
London Hammersmith Palais    15
Northwich                    15
London Bromley               15
Dudley Tipton                14
Watford Waterfields          14
Basildon                     14
Bradford Thornbury           14
Telford                      14
Birmingham City Centre       14
Name: count, dtype: int64
google_top_20 = common_google['Location Name'].value_counts().head(20).reset_index()
google_top_20.columns = ['Location Name', 'Google Negative Reviews']
trustpilot_top_20 = common_trustpilot['Location Name'].value_counts().head(20).reset_index()
trustpilot_top_20.columns = ['Location Name', 'Trustpilot Negative Reviews']
merged_top_20 = pd.merge(google_top_20, trustpilot_top_20, on="Location Name", how="inner")
print("Common Locations with Negative Reviews from Both Google and Trustpilot:")
merged_top_20
Common Locations with Negative Reviews from Both Google and Trustpilot:
| | Location Name | Google Negative Reviews | Trustpilot Negative Reviews |
|---|---|---|---|
| 0 | London Stratford | 59 | 22 |
| 1 | London Enfield | 25 | 23 |
| 2 | London Swiss Cottage | 24 | 15 |
| 3 | Birmingham City Centre | 21 | 14 |
| 4 | Bradford Thornbury | 19 | 14 |
| 5 | London Seven Sisters | 18 | 16 |
| 6 | London Hayes | 17 | 16 |
| 7 | London Bermondsey | 16 | 18 |
google_neg_loc = google_neg["Location Name"].value_counts().reset_index()
pilot_neg_loc = trustpilot_neg["Location Name"].value_counts().reset_index()
merged_count = pd.merge(google_neg_loc, pilot_neg_loc, on='Location Name', how='outer')
merged_count["count_x"] = merged_count["count_x"].fillna(0)
merged_count["count_y"] = merged_count["count_y"].fillna(0)
merged_count["Total by loaction"] = merged_count["count_x"] + merged_count["count_y"]
merged_count = merged_count.sort_values(by='Total by loaction', ascending= False)
merged_count
| | Location Name | count_x | count_y | Total by location |
|---|---|---|---|---|
| 362 | London Stratford | 59.0 | 22.0 | 81.0 |
| 289 | Leicester Walnut Street | 11.0 | 50.0 | 61.0 |
| 326 | London Enfield | 25.0 | 23.0 | 48.0 |
| 0 | 345 | 0.0 | 45.0 | 45.0 |
| 364 | London Swiss Cottage | 24.0 | 15.0 | 39.0 |
| ... | ... | ... | ... | ... |
| 278 | La Chaux de Fonds | 1.0 | 0.0 | 1.0 |
| 59 | 247 - Esbjerg, Randersvej | 1.0 | 0.0 | 1.0 |
| 61 | 249 - Ikast, Strøget | 1.0 | 0.0 | 1.0 |
| 64 | 255 - Kalundborg, Sct. Jørgensbjerg | 1.0 | 0.0 | 1.0 |
| 12 | 124 - Kbh. NV., Lygten | 1.0 | 0.0 | 1.0 |

539 rows × 4 columns
google_top_30 = google_neg[google_neg["Location Name"].isin(merged_count["Location Name"].head(30))][["Comment", "Location Name"]]
pilot_top_30 = trustpilot_neg[trustpilot_neg["Location Name"].isin(merged_count["Location Name"].head(30))][["Comment", "Location Name"]]
top_30_wordcloud = {"Google": google_top_30["Comment"], "Trustpilot": pilot_top_30["Comment"]}
wordcloud(top_30_wordcloud)
top_30_comments = pd.concat([google_top_30, pilot_top_30], ignore_index = True)
df_duplicated = pd.concat([top_30_comments, top_30_comments], ignore_index=True)
model_30, top_30_data, topic_30, probabilities_30 = bert_modelling(df_duplicated)
2025-04-01 04:03:29,736 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-01 04:03:30,963 - BERTopic - Embedding - Completed ✓
2025-04-01 04:03:30,964 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-01 04:03:38,200 - BERTopic - Dimensionality - Completed ✓
2025-04-01 04:03:38,201 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-01 04:03:38,270 - BERTopic - Cluster - Completed ✓
2025-04-01 04:03:38,274 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-01 04:03:38,389 - BERTopic - Representation - Completed ✓
2025-04-01 04:03:39,148 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-01 04:03:39,155 - BERTopic - Dimensionality - Completed ✓
2025-04-01 04:03:39,155 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-01 04:03:39,221 - BERTopic - Cluster - Completed ✓
bert_visuals(model_30)
Total number of topics: 70

Outliers:
   Topic  Count
1     -1    304

Top topics along with their document frequencies:
    Topic  Count
6       0     88
17      1     72
4       2     68
9       3     62
42      4     58
22      5     42
0       6     42
14      7     38
68      8     36
10      9     36

Topic 0 has the following top words:
[('smell', 0.03209920574772303), ('smelly', 0.030228486724505444), ('air', 0.029442284051825408), ('disgusting', 0.024554686369513766), ('aircon', 0.02451718576468661), ('worst', 0.023792796209157436), ('stinks', 0.02200092847548957), ('ever', 0.021161352942561253), ('ventilation', 0.0196847984820827), ('urine', 0.018953204835633626)]

Topic 1 has the following top words:
[('pin', 0.07810738736606905), ('pass', 0.06562447771361557), ('code', 0.05415072747804314), ('day', 0.05070777212915479), ('sent', 0.041654405752340876), ('app', 0.039985745448062736), ('access', 0.038156671096345636), ('bought', 0.03340465626022348), ('work', 0.030147532664350097), ('number', 0.029989309086047054)]
(Note: because the top-30-location subset is small, the reviews were duplicated so that BERTopic has enough documents to form topics and render the intertopic distance map.)
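(An alternative to duplicating the data, not used here, is to relax BERTopic's clustering settings so topics can form from fewer documents. A minimal sketch under that assumption; min_topic_size=5 and the UMAP parameters are illustrative values that would need tuning.)
from umap import UMAP

# Hypothetical small-corpus configuration: smaller clusters and a smaller UMAP
# neighbourhood instead of duplicating the documents.
small_corpus_model = BERTopic(
    umap_model=UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric="cosine", random_state=42),
    min_topic_size=5,
    verbose=True,
)
topics_small, probs_small = small_corpus_model.fit_transform(top_30_comments["Comment"].tolist())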
model_name = "bhadresh-savani/bert-base-uncased-emotion"
classifier = pipeline("text-classification", model = model_name, truncation=True)
Device set to use cuda:0
google_neg_emotion = google_neg[["Comment", "Location Name"]].copy()
trustpilot_neg_emotion = trustpilot_neg[["Comment", "Location Name"]].copy()
def emotion_analysis(text):
    result = classifier(text)[0]["label"]
    return result
google_neg_emotion["emotion"] = google_neg_emotion["Comment"].apply(emotion_analysis)
trustpilot_neg_emotion["emotion"] = trustpilot_neg_emotion["Comment"].apply(emotion_analysis)
def emotion_chart(values):
    for name, counts in values.items():
        plt.bar(counts.index, counts.values)
        plt.xlabel("Emotion")
        plt.ylabel("Distribution")
        plt.title(f"Distribution for all negative reviews in {name}")
        plt.show()
emotion_count_google = google_neg_emotion["emotion"].value_counts()
emotion_count_pilot = trustpilot_neg_emotion["emotion"].value_counts()
emotion_chart({"google": emotion_count_google, "Trustpilot" : emotion_count_pilot})
neg_only= pd.concat([google_neg_emotion[google_neg_emotion["emotion"]== "anger"], trustpilot_neg_emotion[trustpilot_neg_emotion["emotion"]== "anger"]], ignore_index = True)
model_anger, anger_data, topic_anger, probabilities_anger = bert_modelling(neg_only)
2025-04-01 03:13:58,891 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-01 03:14:00,256 - BERTopic - Embedding - Completed ✓
2025-04-01 03:14:00,257 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-01 03:14:12,737 - BERTopic - Dimensionality - Completed ✓
2025-04-01 03:14:12,739 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-01 03:14:12,824 - BERTopic - Cluster - Completed ✓
2025-04-01 03:14:12,827 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-01 03:14:12,916 - BERTopic - Representation - Completed ✓
2025-04-01 03:14:13,683 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-01 03:14:13,691 - BERTopic - Dimensionality - Completed ✓
2025-04-01 03:14:13,692 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-01 03:14:13,776 - BERTopic - Cluster - Completed ✓
bert_visuals(model_anger)
Total number of topics: 36

Outliers:
   Topic  Count
0     -1    826

Top topics along with their document frequencies:
    Topic  Count
1       0    308
2       1    107
3       2     83
5       3     75
21      4     69
16      5     66
4       6     57
25      7     55
9       8     51
22      9     48

Topic 0 has the following top words:
[('nah', 3.619248420447182), ('know', 1.4617849136528802), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]

Topic 1 has the following top words:
[('rude', 0.03431141872944007), ('staff', 0.02840506990443558), ('member', 0.02520934244959189), ('manager', 0.023528607119674173), ('gym', 0.02139922651380406), ('personal', 0.018953258511792522), ('aggressive', 0.016668275528548066), ('reviews', 0.016001951029404715), ('puregym', 0.015331679815607645), ('one', 0.01292573313094035)]
neg_only[["Location Name", "emotion"]].value_counts().head(3)
| Location Name | emotion | count |
|---|---|---|
| London Stratford | anger | 37 |
| Leicester Walnut Street | anger | 22 |
| London Enfield | anger | 19 |
phi_neg = pd.concat([google_neg[["Comment", "Location Name"]], trustpilot_neg[["Comment", "Location Name"]]], ignore_index = True)
torch.random.manual_seed(0)
<torch._C.Generator at 0x7e26d6572930>
model_path = "microsoft/Phi-4-mini-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct: configuration_phi3.py, modeling_phi3.py. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
Device set to use cuda
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}
def phi_model(text):
    messages = [
        {"role": "system", "content": "You work as a data analyst insights guru for a top gym in the UK and you want to find topics for improvements from customer reviews. You must return exactly 3 key topics as strings, strictly formatted as 'topic 1', 'topic 2', 'topic 3'."},
        {"role": "user", "content": "In the following customer review interaction, pick out exactly 3 main topics and return them as a valid Python list: Broken equipment, overcrowded, dirty locker rooms, and unhelpful staff. Look elsewhere!"},
        {"role": "assistant", "content": "'Broken equipment', 'overcrowded', 'unhelpful staff'"},
        {"role": "user", "content": f"In the following customer review interaction, pick out exactly 3 main topics and return only those 3 as a string: {text}"},
    ]
    output = pipe(messages, **generation_args)[0]['generated_text'].replace("'", '"')
    return output
phi_neg["topics"] = phi_neg["Comment"].apply(phi_model)
phi_neg.to_csv("phi_model.csv", index=False)
model_phi = BERTopic(verbose=True)
topics_phi, probabilities_phi = model_phi.fit_transform(phi_neg['topics'].tolist())
2025-04-01 05:20:24,451 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-01 05:20:26,769 - BERTopic - Embedding - Completed ✓
2025-04-01 05:20:26,770 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-01 05:20:41,070 - BERTopic - Dimensionality - Completed ✓
2025-04-01 05:20:41,072 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-01 05:20:41,312 - BERTopic - Cluster - Completed ✓
2025-04-01 05:20:41,316 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-01 05:20:41,456 - BERTopic - Representation - Completed ✓
bert_visuals(model_phi)
Total number of topics: 101

Outliers:
   Topic  Count
1     -1   2247

Top topics along with their document frequencies:
    Topic  Count
2       0    373
7       1    199
11      2    182
23      3    144
3       4    141
36      5    139
26      6     97
49      7     93
38      8     88
6       9     80

Topic 0 has the following top words:
[('unhelpful', 0.12300326579982984), ('overcrowded', 0.10022180631271357), ('broken', 0.09263096587012651), ('staff', 0.0736963666938909), ('equipment', 0.038364977494423844), ('away', 0.0023281513814752044), ('overcrowdedstaff', 0.002150786506725446), ('unavailability', 0.0019639597145591097), ('properly', 0.0017536863381075911), ('put', 0.0017204835562337562)]

Topic 1 has the following top words:
[('gym', 0.023842928637476644), ('experience', 0.020038469076131472), ('poor', 0.01742995207675169), ('workout', 0.01630277849121239), ('worse', 0.016142740962067064), ('inadequate', 0.01405116986141448), ('faulty', 0.01307668117884842), ('terrible', 0.012086116101734209), ('worst', 0.011972406023336333), ('equipment', 0.011579355751910698)]
def phi_improvements(text):
    messages_1 = [
        {"role": "system", "content": "You work as a data analyst insights guru for a top gym in the UK and you want to compress a given list into collated topics. You should return these in a numbered list as business insights that can be used to improve the business"},
        {"role": "user", "content": "In the following list containing the main extracted topics from customer reviews, group or compress the topics and return them with actionable insights in a numbered list: ['Poor service experience','Comparison of gym and service','Overall satisfaction','Service improvement needed','bad quality','Cleanliness of establishment','Neutral dining experience','Potential health and safety concerns','Overall satisfaction']"},
        {"role": "assistant", "content": "1. Cleanliness: Increase cleaning frequency and enforce hygiene rules. \n 2. Equipment maintenance: Repair faulty machines promptly and schedule regular maintenance. \n ..."},
        {"role": "user", "content": f"In the following list containing the main extracted topics from customer reviews, group or compress the topics and return them with actionable insights in a numbered list: {text}"},
    ]
    output = pipe(messages_1, **generation_args)[0]['generated_text']
    return output
split_topic = np.array_split(phi_neg['topics'].tolist(), 2)
for i in split_topic:
    print(phi_improvements(i))
1. Overcrowding: Implement a reservation system to manage gym capacity and reduce overcrowding.
2. Customer service: Train staff to improve customer service and professionalism.
3. Equipment maintenance: Regularly check and repair gym equipment to prevent breakdowns.
4. Hygiene: Increase cleaning frequency and enforce hygiene rules to maintain clean facilities.
5. Staff behavior: Provide training to staff to improve their behavior and professionalism.
6. Gym atmosphere: Create a welcoming and motivating gym atmosphere to enhance the customer experience.
7. App functionality: Improve the app's functionality to reduce wasted time and address day pass and turnstile access issues.

1. Temperature control: Install adjustable thermostats in showers and ensure consistent water temperature.
2. Membership fees: Review pricing structure and consider offering tiered membership options.
3. Outdoor facilities: Regularly maintain and upgrade outdoor amenities to enhance user experience.
4. Long wait times: Implement a more efficient booking system and increase staff during peak hours.
5. Lack of customer service: Provide regular training to staff on customer service skills and communication.
6. Improve communication: Establish clear communication channels and provide regular updates to members.
7. Hygiene cleanliness standards: Enforce strict hygiene protocols and conduct regular inspections.
8. Changing room maintenance: Regularly clean and maintain changing rooms to ensure a pleasant experience.
9. Staff management: Invest in staff training and development to improve overall service quality.
10. Shower temperature: Install adjustable thermostats in showers and ensure consistent water temperature.
11. Cost saving: Review and optimize operational costs to offer more competitive membership fees.
12. Millhouses: Regularly maintain and upgrade facilities to enhance user experience.
13. Overcrowded: Implement a booking system to manage peak times and prevent overcrowding.
14. Dirty locker rooms: Regularly clean and maintain locker rooms to ensure a pleasant experience.
15. Unhelpful staff: Provide regular training to staff on customer service skills and communication.
(Three new topics have been created for each negative review using the Phi model. These new topics were then passed to BERTopic for further topic modeling.)
(The newly created topics were passed back to the Phi model to generate actionable insights.)
comments_neg = phi_neg["Comment"]
tokenized_docs = [word_tokenize(comment) for comment in comments_neg]
dictionary = corpora.Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=2, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
num_topics = 10
passes = 20
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
Topic: 0
Words: 0.027*"staff" + 0.017*"manager" + 0.015*"rude" + 0.013*"member" + 0.011*"one" + 0.011*"members" + 0.009*"said" + 0.008*"personal" + 0.008*"left" + 0.008*"customer"
Topic: 1
Words: 0.046*"water" + 0.030*"showers" + 0.025*"cold" + 0.024*"broken" + 0.020*"shower" + 0.017*"months" + 0.015*"hot" + 0.015*"machine" + 0.014*"machines" + 0.013*"order"
Topic: 2
Words: 0.018*"membership" + 0.017*"email" + 0.016*"day" + 0.015*"pin" + 0.015*"get" + 0.014*"code" + 0.014*"fee" + 0.013*"joining" + 0.013*"pass" + 0.011*"account"
Topic: 3
Words: 0.023*"parking" + 0.014*"pure" + 0.013*"back" + 0.011*"puregym" + 0.010*"park" + 0.009*"one" + 0.009*"free" + 0.009*"car" + 0.007*"date" + 0.007*"would"
Topic: 4
Words: 0.015*"staff" + 0.013*"members" + 0.013*"people" + 0.011*"management" + 0.010*"like" + 0.010*"puregym" + 0.008*"member" + 0.008*"machine" + 0.007*"time" + 0.007*"one"
Topic: 5
Words: 0.030*"membership" + 0.013*"cancel" + 0.012*"pay" + 0.011*"month" + 0.011*"would" + 0.011*"use" + 0.010*"gyms" + 0.009*"get" + 0.009*"go" + 0.008*"puregym"
Topic: 6
Words: 0.049*"air" + 0.019*"conditioning" + 0.016*"music" + 0.016*"hot" + 0.015*"con" + 0.015*"temperature" + 0.009*"classes" + 0.009*"loud" + 0.009*"pt" + 0.009*"working"
Topic: 7
Words: 0.039*"classes" + 0.039*"class" + 0.017*"one" + 0.014*"time" + 0.013*"get" + 0.010*"minutes" + 0.010*"booked" + 0.009*"instructor" + 0.009*"first" + 0.009*"work"
Topic: 8
Words: 0.018*"equipment" + 0.016*"changing" + 0.015*"machines" + 0.010*"dirty" + 0.010*"toilets" + 0.010*"always" + 0.010*"place" + 0.009*"staff" + 0.009*"cleaning" + 0.009*"weights"
Topic: 9
Words: 0.052*"equipment" + 0.025*"people" + 0.024*"use" + 0.019*"busy" + 0.018*"gyms" + 0.018*"machines" + 0.013*"many" + 0.012*"time" + 0.011*"go" + 0.011*"get"
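(Note: num_topics=10 and passes=20 are fixed by hand above. One common sanity check, sketched below as an optional step, is gensim's c_v coherence score over a few candidate topic counts; the candidate values here are arbitrary.)
from gensim.models import CoherenceModel

# Compare coherence for a few candidate topic counts; higher c_v is generally better.
for k in [5, 10, 15, 20]:
    candidate = LdaModel(corpus=corpus, num_topics=k, id2word=dictionary, passes=passes)
    cm = CoherenceModel(model=candidate, texts=tokenized_docs, dictionary=dictionary, coherence="c_v")
    print(f"num_topics={k}: c_v coherence = {cm.get_coherence():.3f}")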
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)
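(If the inline widget does not render, the same visualisation can be written to a standalone HTML file; the filename is an arbitrary choice.)
pyLDAvis.save_html(vis, "lda_topics.html")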