The question you are trying to answer is:

Are more goals scored in women’s international soccer matches than men’s?

You assume a 10% significance level, and use the following null and alternative hypotheses:

\(H_0\) : The mean number of goals scored in women’s international soccer matches is the same as men’s.

\(H_A\) : The mean number of goals scored in women’s international soccer matches is greater than men’s.

library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ✔ purrr   1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
# Read the raw match results for each group, then preview the first rows.
men <- read.csv(file = "data/men_results.csv")
women <- read.csv(file = "data/women_results.csv")
head(men)
head(women)
# Total number of NA cells per data set, reported as (women, men).
vapply(list(women, men), function(d) sum(is.na(d)), integer(1))
## [1] 0 0
# The men's data contains a lot of distinct tournament types.
length(unique(men[["tournament"]]))
## [1] 141
# Bar chart of the number of women's matches per tournament type.
# The x-axis labels are rotated 90 degrees so the tournament names do not overlap.
ggplot(data = women, mapping = aes(x = tournament)) +
  geom_bar() +
  labs(
    title = "Frequency of Tournament types",
    x = "Tournament title",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Keep only FIFA World Cup matches dated after 2002-01-01 and add the total
# number of goals per match (home + away).
# NOTE(review): `date` is compared against the literal "2002-01-01", which
# assumes the column holds ISO-formatted (YYYY-MM-DD) text - confirm.
keep_recent_world_cup <- function(matches) {
  recent_world_cup <- filter(
    matches,
    tournament == "FIFA World Cup" & date > "2002-01-01"
  )
  mutate(recent_world_cup, total_score = home_score + away_score)
}

men <- keep_recent_world_cup(men)
women <- keep_recent_world_cup(women)

# Number of matches remaining in each group, reported as (men, women).
c(nrow(men), nrow(women))
## [1] 384 200
head(women)
# Build a 30-bin histogram of total goals per match for one group.
goals_histogram <- function(matches, plot_title) {
  ggplot(matches, aes(x = total_score)) +
    geom_histogram(bins = 30) +
    labs(x = "Goals scored", y = "Frequency", title = plot_title)
}

plot_men <- goals_histogram(men, "Goals scored (Men)")
plot_women <- goals_histogram(women, "Goals scored (Women)")

# Show both distributions side by side in a single row.
grid.arrange(plot_men, plot_women, nrow = 1)

# The goal counts for men's and women's matches are not normally distributed,
# so compare the two groups with a Wilcoxon-Mann-Whitney test on total_score.
# alternative = "greater" tests whether the women's totals (x) tend to be
# larger than the men's totals (y), matching H_A.
test_results <- wilcox.test(
  x = women$total_score,
  y = men$total_score,
  alternative = "greater"
)

# Significance level for this analysis: 10%, as stated in the problem setup.
sig_level <- 0.10

# Determine the hypothesis test result by comparing the p-value (rounded to
# four decimals for reporting) against the significance level.
p_val <- round(test_results$p.value, 4)
result <- ifelse(p_val <= sig_level, "reject", "fail to reject")

# Create the result data frame
result_df <- data.frame(p_val, result)
result_df