Executive Summary

This report provides a comprehensive analysis of clinical trial data. Key highlights include:

  • Total number of studies: 913
  • Most common study status: COMPLETED
  • Average enrollment: 994 participants

Data Overview

# Interactive, filterable and searchable table of the study records.
#
# The distinct statuses and their palette are computed once here rather than
# inside the style callback, which would otherwise rebuild both on every call.
# Assumes get_colors() returns the same palette for the same n -- TODO confirm.
# NOTE(review): palette positions follow unique() (first-appearance) order,
# whereas the charts derive theirs from table(); confirm whether a status is
# meant to share one colour between the table and the charts.
table_status_levels <- unique(ctg_data$`Study Status`)
table_status_colors <- get_colors(length(table_status_levels))

reactable(
  ctg_data,
  filterable = TRUE,
  searchable = TRUE,
  bordered = TRUE,
  striped = TRUE,
  highlight = TRUE,
  compact = TRUE,
  defaultPageSize = 10,
  columns = list(
    `Study Status` = colDef(
      # Background colour is picked by the status' position among the
      # distinct statuses; text is always white.
      style = function(value) {
        color <- table_status_colors[match(value, table_status_levels)]
        list(background = color, color = 'white')
      }
    ),
    # Thousands separators for enrollment counts.
    Enrollment = colDef(
      format = colFormat(separators = TRUE)
    ),
    `Start Date` = colDef(
      format = colFormat(date = TRUE)
    ),
    `Completion Date` = colDef(
      format = colFormat(date = TRUE)
    )
  ),
  defaultColDef = colDef(
    # Character values are passed through truncate_text(); every other value
    # is rendered unchanged.
    cell = function(value) {
      if (is.character(value)) {
        value <- truncate_text(value)
      }
      value
    },
    minWidth = 100
  )
)

Data Quality Assessment

# Number of NA values in each column of ctg_data.
# vapply() pins the result to a named integer vector; sapply() picks its return
# type from the input, so it can silently hand back a list instead.
missing_data <- vapply(ctg_data, function(x) sum(is.na(x)), integer(1))

# One row per variable: absolute NA count and NA share of all rows (0-100).
missing_data_ctg_data <- data.frame(Variable = names(missing_data),
                              MissingCount = missing_data,
                              PercentMissing = missing_data / nrow(ctg_data) * 100)

# Sort so the variables with the most missing data come first.
missing_data_ctg_data <- missing_data_ctg_data[order(-missing_data_ctg_data$PercentMissing), ]

# Bar chart of the missing share per variable, highest first.
ggplot(missing_data_ctg_data, aes(x = reorder(Variable, -PercentMissing), y = PercentMissing)) +
  geom_bar(stat = 'identity', fill = color_palette[1]) +
  theme_minimal() +
  labs(title = 'Percentage of Missing Data by Variable',
       x = 'Variable',
       y = 'Percent Missing') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # PercentMissing is already on a 0-100 scale, hence scale = 1.
  scale_y_continuous(labels = scales::percent_format(scale = 1))

The chart above shows the percentage of missing data for each variable. Variables with high percentages of missing data may require further investigation or imputation techniques.

Study Status Distribution

# Tally the studies per status and turn the tally into a plotting frame.
status_counts <- table(ctg_data$`Study Status`)
status_ctg_data <- data.frame(
  status = names(status_counts),
  count = as.numeric(status_counts)
)

# One fill colour per status.
n_colors <- nrow(status_ctg_data)
status_colors <- get_colors(n_colors)

# Bars ordered from most to least frequent, each labelled with its count.
# The legend is hidden because the x axis already names every status.
p <- ggplot(
  status_ctg_data,
  aes(x = reorder(status, -count), y = count, fill = status)
) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = count), vjust = -0.5) +
  scale_fill_manual(values = status_colors) +
  labs(
    title = "Distribution of Study Statuses",
    x = "Study Status",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Render the static plot as an interactive widget.
ggplotly(p)

This chart shows the distribution of study statuses. The most common status is ‘COMPLETED’ with 443 studies.

Enrollment by Study Phase

# One fill colour per phase found in the data.
phase_counts <- table(ctg_data$Phases)
n_colors <- length(phase_counts)
phase_colors <- get_colors(n_colors)

# Box plot of enrollment per phase; outliers are drawn as red open circles.
# The y axis is log10-scaled and labelled with comma-separated numbers.
p <- ggplot(
  ctg_data,
  aes(x = Phases, y = Enrollment, fill = Phases)
) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
  scale_fill_manual(values = phase_colors) +
  scale_y_log10(labels = scales::comma_format()) +
  labs(
    title = "Enrollment by Study Phase",
    x = "Study Phase",
    y = "Enrollment"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Render the static plot as an interactive widget.
ggplotly(p)

This boxplot visualizes the distribution of enrollment numbers across different study phases. Note the logarithmic scale on the y-axis to better show the wide range of enrollment numbers.

Study Duration Timeline

# Parse both date columns and derive each study's duration in years.
# NOTE(review): assumes the date strings are full "YYYY-MM-DD" values; anything
# else becomes NA under this format -- confirm against the data source.
ctg_data$start_date <- as.Date(
  ctg_data$`Start Date`,
  format = "%Y-%m-%d"
)
ctg_data$completion_date <- as.Date(
  ctg_data$`Completion Date`,
  format = "%Y-%m-%d"
)
# Day difference divided by 365.25 gives years.
ctg_data$duration <- with(
  ctg_data,
  as.numeric(completion_date - start_date) / 365.25
)

# One point colour per study status.
status_counts <- table(ctg_data$`Study Status`)
n_colors <- length(status_counts)
status_colors <- get_colors(n_colors)

# Scatter of start date against duration, coloured by status, with a yearly
# x-axis tick.
p <- ggplot(
  ctg_data,
  aes(x = start_date, y = duration, color = `Study Status`)
) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = status_colors) +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  labs(
    title = "Study Duration Timeline",
    x = "Start Date",
    y = "Study Duration (Years)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Render the static plot as an interactive widget.
ggplotly(p)

This scatter plot shows the relationship between study start dates and durations. Each point represents a study, colored by its status.

Funding Sources and Study Types

# Count studies per (funder type, study type) pair, then express each count as
# a share of its funder type's total.
# ungroup() drops the grouping once the shares are computed, so later use of
# ctg_data_summary is not silently evaluated per funder type.
ctg_data_summary <- ctg_data %>%
  count(`Funder Type`, `Study Type`) %>%
  group_by(`Funder Type`) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()

# One fill colour per study type.
study_type_counts <- table(ctg_data$`Study Type`)
n_colors <- length(study_type_counts)
study_type_colors <- get_colors(n_colors)

# Side-by-side (dodged) bars: one bar per study type within each funder type.
p <- ggplot(ctg_data_summary, aes(x = `Funder Type`, y = prop, fill = `Study Type`)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  theme_minimal() +
  labs(title = 'Funding Sources and Study Types',
       x = 'Funder Type',
       y = 'Proportion') +
  scale_fill_manual(values = study_type_colors) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # prop is a 0-1 fraction, so the default percent scaling applies.
  scale_y_continuous(labels = scales::percent_format())

# Render the static plot as an interactive widget.
ggplotly(p)

This grouped bar chart shows the proportion of different study types for each funder type, with one bar per study type placed side by side within each funder type.

Conclusion

This report provides a comprehensive overview of the clinical trial data, highlighting key trends in study status, enrollment, duration, and funding. The visualizations offer insights into the distribution and relationships within the data, which can be valuable for decision-making and further analysis.

For any questions or further analysis requests, please contact the report author.