Introduction

This bank customer segmentation was performed following data cleaning and initial EDA (not shown). RFM scoring and K-means clustering were applied to identify distinct customer groups based on behavioral and demographic characteristics. The results support targeted marketing strategies, reduce churn risk, and enable the design of personalized banking services.

Data Preparation

Load required packages

library(tidyverse)
library(ggforce)
library(janitor)
library(scales)
library(factoextra)
library(gridExtra)
options(scipen = 999)

Load the dataset

data <- read.csv("cleanedBankDataset.csv")

Standardize column names

data <- clean_names(data)

Convert date format

data <- data %>%
  mutate(transaction_date = ymd(transaction_date))

RFM Analysis

RFM is a marketing technique used to evaluate customer value based on three key variables.

In this case ( a banking scenario):

Recency: The number of days since the customer’s last transaction.
Frequency: Total number of transactions within a defined period.
Monetary: Total transaction value during that same time frame.

Each customer is assigned a score from 1 to 5 for each variables. These scores are then combined to rank and group customers into meaningful segments for targeted strategies.

Calculate RFM metrics

# set reference data as the max transaction data
ref_date <- max(data$transaction_date, na.rm = TRUE)

# extract customer info
cust_info <- data %>%
  group_by(customer_id) %>%
  slice(1) %>%
  select(customer_id, cust_gender, customer_age) %>%
  ungroup() %>%
  mutate(cust_gender = if_else(cust_gender == "M", 1, 0))

# compute RFM metrics
rfm <- data %>%
  group_by(customer_id) %>%
  summarise(
    recency = as.integer(ref_date - max(transaction_date, na.rm = TRUE)),
    frequency = n(),
    monetary = sum(transaction_amount_inr, na.rm = TRUE),
    avg_transaction_amount = mean(transaction_amount_inr, na.rm = TRUE),
    last_transaction_amount = last(transaction_amount_inr),
    avg_account_balance = mean(cust_account_balance, na.rm = TRUE),
    last_account_balance = last(cust_account_balance),
    .groups = "drop"
  )

# join customer info
rfm <- rfm %>%
  left_join(cust_info, by = "customer_id")

# view the dataframe
head(rfm)

summary(rfm)

##  customer_id           recency        frequency        monetary      
##  Length:682904      Min.   : 0.00   Min.   :1.000   Min.   :      0  
##  Class :character   1st Qu.:45.00   1st Qu.:1.000   1st Qu.:    150  
##  Mode  :character   Median :57.00   Median :1.000   Median :    405  
##                     Mean   :56.81   Mean   :1.001   Mean   :   1358  
##                     3rd Qu.:70.00   3rd Qu.:1.000   3rd Qu.:   1061  
##                     Max.   :81.00   Max.   :3.000   Max.   :1560035  
##  avg_transaction_amount last_transaction_amount avg_account_balance
##  Min.   :      0        Min.   :      0         Min.   :       0   
##  1st Qu.:    150        1st Qu.:    150         1st Qu.:    4382   
##  Median :    404        Median :    404         Median :   15203   
##  Mean   :   1357        Mean   :   1357         Mean   :   87387   
##  3rd Qu.:   1060        3rd Qu.:   1060         3rd Qu.:   49568   
##  Max.   :1560035        Max.   :1560035         Max.   :42331377   
##  last_account_balance  cust_gender      customer_age  
##  Min.   :       0     Min.   :0.0000   Min.   :18.00  
##  1st Qu.:    4382     1st Qu.:0.0000   1st Qu.:25.00  
##  Median :   15203     Median :1.0000   Median :28.00  
##  Mean   :   87388     Mean   :0.7237   Mean   :29.58  
##  3rd Qu.:   49568     3rd Qu.:1.0000   3rd Qu.:33.00  
##  Max.   :42331377     Max.   :1.0000   Max.   :47.00

Add RFM scores

rfm <- rfm %>%
  mutate(
    recency_score = 6 - ntile(recency, 5),
    frequency_score = ntile(frequency, 5),
    monetary_score = ntile(monetary, 5),
    rfmscore = round((recency_score + frequency_score + monetary_score) / 3, 2),
    rfmLevel = paste0(recency_score, frequency_score, monetary_score)
  )

Assign segments based on RFM scores

rfm <- rfm %>%
  mutate(
    segment = case_when(
      rfmscore >= 4.5 ~ "Champions",
      rfmscore >= 4.0 ~ "Loyal Customers",
      rfmscore >= 3.0 ~ "Potential Loyalists",
      rfmscore >= 2.0 ~ "Needs Attention",
      rfmscore >= 1.0 ~ "Hibernating",
      TRUE ~ "At Risk"
    )
  )

Plot segmentation distribution

rfm %>%
  count(segment) %>%
  mutate(proportion = round(n / sum(n), 2),
         n_fmt = comma(n),
         percent_fmt = percent(proportion)) %>%
  ggplot(aes(x = reorder(segment, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = paste0(n_fmt, " (", percent_fmt, ")")), vjust = -0.5) +
  scale_y_continuous(labels = comma) +
  labs(title = "Customer Segments by RFM", y = "Customer Count", x = "Segment") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold",size = 15,
                                  hjust = 0.5,vjust = 0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(color = "black"),
        axis.ticks = element_line(color = "black")
  )

K-Means Clustering

Data Preprocessing

# select features for clustering
rfm_features <- rfm %>%
  select(recency, frequency, monetary, cust_gender, customer_age)

# scale features
rfm_scaled <- scale(rfm_features)

Find the Optimal Number of Clusters

# sample 10,000
set.seed(123)
rfm_sampled <- rfm_scaled[sample(nrow(rfm_scaled), 10000), ]

# elbow method
fviz_nbclust(
  rfm_sampled,
  FUN = function(x, k) kmeans(x, centers = k, nstart = 25, iter.max = 100),
  method = "wss",
  k.max = 10
) +
  labs(title = "Elbow Method for Optimal k",
       subtitle = "Based on a sample of 10,000 observations")

Perform Clustering (k = 4)

k <- 4
set.seed(123)

# sample 500,000 for initial clustering
sample_indices <- sample(nrow(rfm_scaled), 500000)
sample_data <- rfm_scaled[sample_indices, ]

# initial clustering
kmeans_sample <- kmeans(sample_data, centers = 4, nstart = 25, iter.max = 100)
initial_centers <- kmeans_sample$centers

# final clustering
kmeans_result <- kmeans(rfm_scaled, centers = initial_centers, 
                        nstart = 1, iter.max = 30)

Add cluster labels to data

rfm$cluster <- factor(kmeans_result$cluster)

PCA Projection

fviz_cluster(kmeans_result, data = rfm_scaled,
             geom = "point", ellipse.type = "norm",
             palette = c("steelblue1", "ivory3", "darkorange", "yellow"),
             shape = 19, pointsize = 0.5, repel = TRUE,
             ggtheme = theme_minimal())+
             labs(title = "Customer Clusters (PCA Projection)", 
                  subtitle = "Based on RFM + Demographic Features")

Cluster Analysis

Show cluster sizes

cat("Cluster sizes:\n")

## Cluster sizes:

print(table(rfm$cluster))

## 
##      1      2      3      4 
##   1905 492234    842 187923

Between/Total SS ratio

round(kmeans_result$betweenss / kmeans_result$totss, 3)

## [1] 0.48

Cluster profiling

cluster_summary <- rfm %>%
  group_by(cluster) %>%
  summarise(
    count = n(),
    avg_recency = round(mean(recency), 1),
    avg_frequency = round(mean(frequency), 1),
    avg_monetary = round(mean(monetary), 1),
    avg_balance = round(mean(avg_account_balance), 1),
    avg_gender = round(mean(cust_gender), 1),
    avg_age = round(mean(customer_age, na.rm = TRUE), 1)
  )
print(cluster_summary)

## # A tibble: 4 × 8
##   cluster  count avg_recency avg_frequency avg_monetary avg_balance avg_gender
##   <fct>    <int>       <dbl>         <dbl>        <dbl>       <dbl>      <dbl>
## 1 1         1905        56               1       73415.     426304.        0.7
## 2 2       492234        56.8             1        1099.      87382.        1  
## 3 3          842        49.9             2        2090       82857.        0.8
## 4 4       187923        56.9             1        1304.      83985.        0  
## # ℹ 1 more variable: avg_age <dbl>

Visualization

# Reshape
cluster_long <- cluster_summary %>%
  select(-count) %>%
  pivot_longer(-cluster, names_to = "Metric", values_to = "Value")

# Define colors for clusters
cluster_colors <- c("1" = "steelblue1", "2" = "ivory3", 
                    "3" = "darkorange", "4" = "yellow")

# plot it
features <- unique(cluster_long$Metric)
plot_list <- lapply(features, function(f) {
  cluster_long %>%
    filter(Metric == f) %>%
    ggplot(aes(x = cluster, y = Value, fill = cluster)) +
    geom_col(position = "dodge") +
    labs(title = paste("Cluster Comparison for", f), 
         y = f, x = "Cluster") +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 15, 
                                hjust = 0.5, vjust = 0.5),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.line = element_line(color = "black"),
      axis.ticks = element_line(color = "black")
    ) +
    scale_fill_manual(values = cluster_colors)
})

do.call(grid.arrange, c(plot_list[1:2], nrow = 1))

do.call(grid.arrange, c(plot_list[3:4], nrow = 1))

do.call(grid.arrange, c(plot_list[5:6], nrow = 1))

Cluster Insights:

Cluster 1 - Small, High Value: A small but highly valuable group of older customers with high account balances and spending. They engage less frequently but contribute significantly.
Cluster 2 - Inactive Males: The largest segment, dominated by male customers with low recent activity, low frequency and low value. They show signs of disengagement.
Cluster 3 - Active Low Spender: A tiny group of younger, highly engaged customers. They transact frequently and recently, but contribute low monetary value. Their loyalty presents upsell opportunities.
Cluster 4 - Hibernating Females: A mid-sized group of the youngest, all-female customers with low engagement and value. They are largely inactive but may respond to targeted offers.

Indian Bank Insights

RFM Analysis & K-Means Clustering & PCA

Lexin Li