You can download a zip file of the code, along with all output images and some R data here (5.4 MB).
The Senate formal preferences files are large – over a gigabyte for NSW – and I did the initial processing in perl. My perl is not overly fluent and doesn't conform to best practices, but it works, and perl flies through a file line-by-line much faster than R.
The perl script below reads the preferences file for a given state, and outputs a CSV of the ATL votes plus four CSVs of graph distances between the groups.
I didn't think I'd have four different distance calculations, and there's a bit of repetition that could be tidied up on that front.
The idea is that the output CSVs are then ready for use in R, though the NSW ATL votes file (used in PCA) was too large for my laptop when it only had 4 GB of RAM. I've since installed another 8 GB of RAM and now R has no problems with it.
# Which state's preferences to process. Set manually here;
# a single command-line argument over-rides it:
$state = "sa";
$state = $ARGV[0] if $#ARGV == 0;
# The input file is the formal preferences by polling place.
# The AEC gives these names like
# SenateStateFirstPrefsByPollingPlaceDownload-20499-NT.csv
# which I've renamed to prefs_nt.csv, prefs_nsw.csv, etc.
# (The leading "<" / ">" in each filename is the open() mode.)
$infile_votes = sprintf("<pref_files/prefs_%s.csv", $state);
# CSV file with my party/group abbreviations:
$infile_groups = sprintf("<groups/%s_groups.csv", $state);
# Output file to store the ATL votes with no other information:
$outfile_atl = sprintf(">atl_votes_%s.csv", $state);
# Output files for the graph distances, defined in various ways:
$outfile_graph_max = sprintf(">graph_distances/distances_max_%s_blankmax.csv", $state);
$outfile_graph_avg = sprintf(">graph_distances/distances_max_%s_blankavg.csv", $state);
$outfile_graph_asym = sprintf(">graph_distances/distances_max_%s_blankmax_asym.csv", $state);
$outfile_graph_asym_wtd = sprintf(">graph_distances/distances_max_%s_blankmax_asym_wtd.csv", $state);
# Number of BTL candidates per state (I could work these out
# later from the data, but it's easier to hard-code them here):
%btl_counts = (
    tas => 58,
    wa  => 79,
    sa  => 64,
    qld => 122,
    nsw => 151,
    vic => 116,
    act => 22,
    nt  => 19,
);
$num_btl = $btl_counts{$state};
# Read the party/group abbreviations from the first column of
# the groups CSV into @atl_groups. A missing file would otherwise
# silently produce an empty group list and nonsense output, so
# die loudly if the open fails.
open(in_file_groups, $infile_groups) or die "Can't open $infile_groups: $!";
# Skip the header line:
$line = <in_file_groups>;
@atl_groups = ();
while (<in_file_groups>) {
$line = $_;
chomp($line);
# The CSV file may have commas in party names, but they
# don't exist in the first column (abbreviations), so no
# need for Text::CSV.
@cells = split(",", $line);
$this_group = $cells[0];
# Strip any quotation marks around the abbreviation:
$this_group =~ s/\"//g;
push(@atl_groups, $this_group);
}
close(in_file_groups);
# Some useful variables for reading the preference lines:
$num_atl = $#atl_groups + 1;       # number of ATL groups
$num_total = $num_atl + $num_btl;  # total preference columns per ballot
$atl_i = $num_atl - 1;             # last ATL column index
$btl_start_i = $num_atl;           # first BTL column index
$btl_end_i = $num_total - 1;       # last BTL column index
# Initialise the "matrix" for each distances graph and the number
# of votes: stored as a hash keyed by row index, each value being
# an array ref holding one row of the matrix.
# (Initialising a hash with "= {}" would store a junk hash-ref
# key; use the empty list. Two of these hashes were previously
# initialised under typo'd names, %distance_asym and
# %distance_asym_wtd, leaving the real hashes uninitialised.)
%distances_max = ();
%distances_avg = ();
%distances_asym = ();
%distances_asym_wtd = ();
%N_votes = ();
%N_votes_asym = ();
%N_not_preffed = ();
for ($i = 0; $i < $num_atl; $i++) {
    # One row of zeros per matrix:
    $distances_max{$i} = [ (0) x $num_atl ];
    $distances_avg{$i} = [ (0) x $num_atl ];
    $distances_asym{$i} = [ (0) x $num_atl ];
    $distances_asym_wtd{$i} = [ (0) x $num_atl ];
    $N_votes{$i} = [ (0) x $num_atl ];
    $N_votes_asym{$i} = [ (0) x $num_atl ];
    $N_not_preffed{$i} = [ (0) x $num_atl ];
}
# Main pass over the preferences file: write the ATL votes to
# their own CSV and accumulate the pairwise distance sums for
# each distance definition. The input is the AEC's *formal*
# preferences file, so every line is a formal vote, either ATL
# or BTL.
open(in_file_votes, $infile_votes);
open(out_file_atl, $outfile_atl);
# Header row of the ATL votes CSV is the group abbreviations:
print out_file_atl join(",", @atl_groups) . "\n";
# First line of the file is the column headers, second
# line is a bunch of hyphens, so skip over these.
$line = <in_file_votes>;
$line = <in_file_votes>;
# Line counter, used only for progress print-outs (starts at 3
# because two header lines were just consumed):
$line_ct = 3;
while(<in_file_votes>) {
$line = $_;
chomp($line);
if ($line_ct % 10000 == 0) {
# Progress update:
print $line_ct . "\n";
}
# The preferences are the only part of the line
# inside quotation marks, so isolate that part:
$pref_part = $line;
$pref_part =~ s/.*?\"//;
$pref_part =~ s/\"//;
# Ticks and crosses count as a first preference, so
# replace / and * with 1:
$pref_part =~ s/\//1/g;
$pref_part =~ s/\*/1/g;
# Split into the ATL (group) columns and BTL (candidate) columns:
@full_prefs = split(",", $pref_part);
@atl_prefs = @full_prefs[0..$atl_i];
@btl_prefs = @full_prefs[$btl_start_i..$btl_end_i];
# Check BTL formality: a formal BTL vote must have each of the
# preferences 1 to 6 appearing exactly once.
$formal_btl = 1;
for ($i = 1; $i <= 6; $i++) {
@this_numbers = grep { $_ == $i; } @btl_prefs;
if ($#this_numbers != 0) {
# Either a duplicated preference or an exhaust by preference 6.
$formal_btl = 0;
last;
}
}
if ($formal_btl == 0) {
# BTL vote is informal, so (the file containing only formal
# votes) this ballot counts as an ATL vote.
# Initialise an array to store the groups
# in preference order:
@this_prefs = ();
# @atl_prefs might have duplicated preferences or
# other irregularities, so initialise a new array
# to be written to file:
@atl_to_print = ();
for ($i = 0; $i < $num_atl; $i++) {
# Initialise to the last preference...
# maybe this would be better left blank
# and processed as needed in R.
push(@atl_to_print, $num_atl);
}
# Walk the ATL preferences in order 1, 2, ...; stop at the
# first duplicated or missing preference number.
for ($i = 1; $i <= $num_atl; $i++) {
@this_numbers = grep { $atl_prefs[$_] == $i; } 0..$#atl_prefs;
if ($#this_numbers != 0) {
# Exhaust or duplicate.
last;
} else {
push(@this_prefs, $this_numbers[0]);
$atl_to_print[$this_numbers[0]] = $i;
}
}
print out_file_atl join(",", @atl_to_print) . "\n";
# Add distances. $this_prim is the group receiving the
# primary (first preference) vote.
$this_prim = $this_prefs[0];
# For all distance types except asym_wtd, first add
# the "not preferenced" distance to all groups,
# then afterwards subtract it from the actually
# preferenced groups.
for ($i = 0; $i < $num_atl; $i++) {
if ($i != $this_prim) {
$distances_max{$this_prim}[$i] += $num_atl - 1;
$distances_max{$i}[$this_prim] += $num_atl - 1;
$distances_avg{$this_prim}[$i] += 0.5*($num_atl + $#this_prefs);
$distances_avg{$i}[$this_prim] += 0.5*($num_atl + $#this_prefs);
$distances_asym{$this_prim}[$i] += $num_atl - 1;
$N_votes{$this_prim}[$i]++;
$N_votes{$i}[$this_prim]++;
$N_votes_asym{$this_prim}[$i]++;
$N_not_preffed{$this_prim}[$i]++;
}
}
# Correct the entries for the groups that actually were
# preferenced: preference $i sits at distance $i from the
# primary group, so adjust away the blanket "not preferenced"
# distance added above.
for ($i = 1; $i <= $#this_prefs; $i++) {
$this_pref = $this_prefs[$i];
$distances_max{$this_prim}[$this_pref] += $i - ($num_atl - 1);
$distances_max{$this_pref}[$this_prim] += $i - ($num_atl - 1);
$distances_avg{$this_prim}[$this_pref] += $i - 0.5*($num_atl + $#this_prefs);
$distances_avg{$this_pref}[$this_prim] += $i - 0.5*($num_atl + $#this_prefs);
$distances_asym{$this_prim}[$this_pref] += $i - ($num_atl - 1);
# For asym_wtd we don't have to subtract off anything:
$distances_asym_wtd{$this_prim}[$this_pref] += $i;
$N_not_preffed{$this_prim}[$this_pref]--;
}
}
$line_ct++;
}
# Finished with the input votes file. (This previously closed a
# non-existent handle, "in_file".)
close(in_file_votes);
# Convert the accumulated distance sums into per-vote averages,
# and prepare the weighting of the not-preferenced votes.
$sum_not_preffed = 0;
for ($i = 0; $i < $num_atl; $i++) {
    for ($j = 0; $j < $i; $j++) {
        if ($N_votes{$i}[$j] > 0) {
            # Symmetric distances: divide by the number of votes
            # that contributed to the pair.
            $distances_max{$i}[$j] /= $N_votes{$i}[$j];
            $distances_max{$j}[$i] /= $N_votes{$j}[$i];
            $distances_avg{$i}[$j] /= $N_votes{$i}[$j];
            $distances_avg{$j}[$i] /= $N_votes{$j}[$i];
            # The asymmetric distances are normalised by the number
            # of primary votes for the row's party; guard against a
            # party with zero ATL primaries (division by zero is
            # fatal in perl):
            if ($N_votes_asym{$i}[$j] > 0) {
                $distances_asym{$i}[$j] /= $N_votes_asym{$i}[$j];
            }
            if ($N_votes_asym{$j}[$i] > 0) {
                $distances_asym{$j}[$i] /= $N_votes_asym{$j}[$i];
            }
            # Re-symmetrise the asymmetric distances:
            $distances_asym{$i}[$j] = 0.5*($distances_asym{$i}[$j] + $distances_asym{$j}[$i]);
            $distances_asym{$j}[$i] = $distances_asym{$i}[$j];
        }
    }
    # Preparation for the weighting of the not-preferenced votes.
    # Careful with the order of the indices here: in braces is the
    # primary vote party; we want to sum a party's not-preferenced
    # votes over all primary vote possibilities, storing the total
    # on the diagonal.
    for ($j = 0; $j < $num_atl; $j++) {
        if ($i != $j) {
            $N_not_preffed{$i}[$i] += $N_not_preffed{$j}[$i];
        }
    }
    $sum_not_preffed += $N_not_preffed{$i}[$i];
}
$mean_not_preffed = $sum_not_preffed / $num_atl;
# Add the weighted not-preferenced votes to the asym_wtd distances:
for ($i = 0; $i < $num_atl; $i++) {
    for ($j = 0; $j < $num_atl; $j++) {
        if ($i != $j) {
            # Guard the two divisors as above, in case a party
            # received no not-preferenced votes or no primaries:
            if ($N_not_preffed{$j}[$j] > 0) {
                $distances_asym_wtd{$i}[$j] += ($num_atl - 1) * $N_not_preffed{$i}[$j] * $mean_not_preffed / $N_not_preffed{$j}[$j];
            }
            if ($N_votes_asym{$i}[$j] > 0) {
                $distances_asym_wtd{$i}[$j] /= $N_votes_asym{$i}[$j];
            }
        }
    }
}
# Re-symmetrise (looping over j < i already covers every pair
# exactly once, so no i != j check is needed):
for ($i = 0; $i < $num_atl; $i++) {
    for ($j = 0; $j < $i; $j++) {
        $distances_asym_wtd{$i}[$j] = 0.5*($distances_asym_wtd{$i}[$j] + $distances_asym_wtd{$j}[$i]);
        $distances_asym_wtd{$j}[$i] = $distances_asym_wtd{$i}[$j];
    }
}
# Write the four distance matrices to CSV: a header row of group
# abbreviations, then one matrix row per group.
open(out_file_graph_max, $outfile_graph_max) or die "Can't open $outfile_graph_max: $!";
open(out_file_graph_avg, $outfile_graph_avg) or die "Can't open $outfile_graph_avg: $!";
open(out_file_graph_asym, $outfile_graph_asym) or die "Can't open $outfile_graph_asym: $!";
open(out_file_graph_asym_wtd, $outfile_graph_asym_wtd) or die "Can't open $outfile_graph_asym_wtd: $!";
print out_file_graph_max join(",", @atl_groups) . "\n";
print out_file_graph_avg join(",", @atl_groups) . "\n";
print out_file_graph_asym join(",", @atl_groups) . "\n";
print out_file_graph_asym_wtd join(",", @atl_groups) . "\n";
for ($i = 0; $i < $num_atl; $i++) {
    # Each matrix "row" is an array ref stored in the hash, so
    # dereference it and join with commas (this replaces the old
    # element-by-element concatenation loop):
    print out_file_graph_max join(",", @{$distances_max{$i}}) . "\n";
    print out_file_graph_avg join(",", @{$distances_avg{$i}}) . "\n";
    print out_file_graph_asym join(",", @{$distances_asym{$i}}) . "\n";
    print out_file_graph_asym_wtd join(",", @{$distances_asym_wtd{$i}}) . "\n";
}
close(out_file_graph_max);
close(out_file_graph_avg);
close(out_file_graph_asym);
close(out_file_graph_asym_wtd);
close(out_file_atl);
Now to R. I have a file called "htv_abbrevs.R" which is just a block of code (rather than a function) which has the Liberal(/National/etc.) and Labor how-to-vote cards – well, only one of them in each state – and a couple of lines to say to use Rise Up Australia instead of One Nation in the territories (used in orienting the second axis).
# Lookup table of how-to-vote (HTV) details per state: the display
# title, the main Coalition party's abbreviation, and the next five
# preferences on the ALP and Liberal(/National/etc.) HTV cards.
htv_details = list(
nsw = list(title = "NSW", lib = "LNP",
alp = c("Grn", "REP", "AJP", "ASXP", "LDP"),
lib_card = c("CDP", "SFF", "FF", "LDP", "AMEP")),
vic = list(title = "Vic", lib = "LNP",
alp = c("ASXP", "DHJP", "REP", "Grn", "S/C"),
lib_card = c("FF", "AC", "DLP", "ACP", "DHJP")),
qld = list(title = "Qld", lib = "LNP",
alp = c("Grn", "S/H", "JLN", "GLT", "KAP"),
lib_card = c("FF", "CDP", "SFF", "KAP", "AC")),
wa = list(title = "WA", lib = "Lib",
alp = c("Grn", "H/S", "DHJP", "SFF", "Nat"),
lib_card = c("Nat", "AC", "SFF", "LDP", "FF")),
sa = list(title = "SA", lib = "Lib",
alp = c("Grn", "AJP", "ME", "NXT", "AMEP"),
lib_card = c("FF", "CDP", "SFF", "AMEP", "NXT")),
tas = list(title = "Tas", lib = "Lib",
alp = c("Grn", "JLN", "ARFP", "S/H", "REP"),
lib_card = c("CDP", "SFF", "LDP", "FF", "ALP")),
act = list(title = "ACT", lib = "Lib",
alp = c("Grn", "ASXP", "AJP", "SPA", "LDP"),
lib_card = c("CDP", "LDP", "AJP", "SA", "ALP")),
nt = list(title = "NT", lib = "CLP",
alp = c("Grn", "H/S", "CDP", "CLP", "CEC"),
lib_card = c("CDP", "ALP", "H/S", "Grn", "RUA"))
)
this_state_htv = htv_details[[state]]
state_title = this_state_htv$title
lib_abbrev = this_state_htv$lib
alp_htv = this_state_htv$alp
lib_htv = this_state_htv$lib_card
# One Nation didn't contest the territories, so use Rise Up
# Australia there instead (used to orient the second axis):
if (state %in% c("act", "nt")) {
phon_rua = "RUA"
} else {
phon_rua = "PHON"
}
# Prepend the lead party to its own card:
alp_htv = c("ALP", alp_htv)
lib_htv = c(lib_abbrev, lib_htv)
Now comes the plotting. First up is the PCA calculation, with all of the hard work done by prcomp().
# PCA of the ATL preference data for one state.
library(dplyr)
library(readr)
library(ggplot2)
library(ggrepel)
# Which state to analyse (determines the input/output filenames):
state = "nsw"
# Garbage collection -- if a large variable was deleted earlier
# in the session, its memory may not yet have been released;
# gc() frees it up (the NSW votes file is big).
gc()
# Defines state_title, lib_abbrev, alp_htv, lib_htv, phon_rua:
source("htv_abbrevs.R")
if (state %in% c("act", "nt")) {
# Only highlight the first three parties
# on the HTV's in the territories.
alp_htv = alp_htv[1:3]
lib_htv = lib_htv[1:3]
}
png_print = function(out_file, img_width, img_height, ggplot_img) {
    # Render a plot object to a PNG file.
    #
    # out_file: output filename; img_width/img_height: size in pixels;
    # ggplot_img: the plot to print. Returns TRUE.
    png(filename=out_file, width=img_width, height=img_height, units="px")
    # Close the graphics device even if print() errors, so a failed
    # plot can't leave a dangling open device:
    on.exit(dev.off(), add=TRUE)
    print(ggplot_img)
    return(TRUE)
}
# Input: the ATL votes CSV written by the perl script
# (one row per ballot, one column per group).
in_file = sprintf("atl_votes_%s.csv", state)
prefs.df = read_csv(in_file)
# The calculation: scaled PCA over the preference columns.
pref_pca = prcomp(prefs.df, scale. = TRUE)
# Eigenvalues are the squared singular values; the cumulative sum
# gives the fraction of variance explained by the first k components:
pca_evals = pref_pca$sdev^2
pca_evals_cumul = cumsum(pca_evals)
fraction_variance = pca_evals_cumul / sum(pca_evals)
# A bit of fiddly work to classify the groups into HTV cards
# ("ALP", "Lib", "Both", or "Neither"), starting from the
# loadings matrix (one row per group):
pca.df = as.data.frame(pref_pca$rotation)
pca.df$name = row.names(pca.df)
pca.df$ALP_HTV = ""
pca.df$Lib_HTV = ""
pca.df$ALP_HTV[which(pca.df$name %in% alp_htv)] = "ALP"
pca.df$Lib_HTV[which(pca.df$name %in% lib_htv)] = "Lib"
pca.df$HTV = sprintf("%s%s", pca.df$ALP_HTV, pca.df$Lib_HTV)
pca.df$HTV = gsub("ALPLib", "Both", pca.df$HTV)
pca.df$HTV[which(pca.df$HTV == "")] = "Neither"
# Orienting the axes (PCA signs are arbitrary): put ALP left of
# the Coalition party, One Nation (or RUA in the territories)
# in the upper half-plane.
ALP_i = which(pca.df$name == "ALP")
LNP_i = which(pca.df$name == lib_abbrev)
PHON_i = which(pca.df$name == phon_rua)
if (pca.df$PC1[ALP_i] > pca.df$PC1[LNP_i]) {
pca.df$PC1 = -pca.df$PC1
}
if (pca.df$PC2[PHON_i] < 0) {
pca.df$PC2 = -pca.df$PC2
}
title_str = sprintf("PCA: %s ATL preferences", state_title)
out_file_png = sprintf("images/pca/%s_pc1_pc2.png", state)
out_file_variance = sprintf("pca_data/%s_fraction_variance.txt", state)
out_file_pca = sprintf("pca_data/%s_df.Rda", state)
# Colour scheme for the HTV-card classification:
htv_colours = c(ALP = "#FF0000", Lib = "#0000FF", Both = "#800080", Neither = "#000000")
# Scatter of the PC1/PC2 loadings, labelled and coloured by card:
circle_plot = ggplot(data=pca.df) +
geom_point(aes(x=PC1, y=PC2, colour=factor(HTV))) +
geom_text_repel(aes(x=PC1, y=PC2, label=name, colour=factor(HTV)), size=6, show.legend=FALSE) +
scale_color_manual(values=htv_colours, name="HTV cards") +
theme_grey(base_size = 16) +
guides(colour = guide_legend(override.aes = list(size=4))) +
theme(legend.text=element_text(size=16)) +
ggtitle(title_str)
# The calculation's not terribly slow, but it's
# quicker to load the factor loadings from disk than
# to calculate it again....
write(fraction_variance, file=out_file_variance, ncolumns = 1)
save("pca.df", file=out_file_pca)
png_print(out_file_png, 600, 500, circle_plot)
And finally the force-directed graphs. This calculation is not memory-intensive like the PCA may be, and I put everything in a big loop and just let it churn away for a while. Plots are translated and rotated so that Labor and the main Coalition party are both at y = 0, equidistant from the origin with Labor on the left. Some number (50 for me) of trials are calculated for each set of parameters, and the locations averaged. The vertical axis is supposed to have One Nation in the upper half-plane, but that is only guaranteed for the first trial in a set. Subsequent trials compare the y-values of all parties to the first trial, and flip the results if that would make for closer agreement.
The iteration from randomised starting locations to equilibrium is inspired by the idea of springs with an equilibrium distance, but the calculations are not real physics – the points don't carry momentum, and it's only the positions that get incremented in proportion to how much "force" they feel.
# Force-directed graph layouts of the party distance matrices.
library(dplyr)
library(readr)
library(ggplot2)
library(ggrepel)
# If you want to plot the convergence from
# the initial randomised locations to
# equilibrium, set to TRUE (slows it down a lot):
plot_intermediate_conv = FALSE
# If you want to plot individual trials as
# they're calculated, set to TRUE
# (they'll be over-written with a fixed set
# of axes afterwards, but it can be nice to
# check that it's running properly):
plot_intermediate = FALSE
# Each trial has a new randomised set of start
# locations. How many trials to do?
num_trials = 50
# The parameters to loop over. blank_types must match the
# "blank..." suffixes of the perl script's output filenames;
# distance_powers are exponents applied to the normalised
# distance matrix.
blank_types = c("max", "avg", "max_asym", "max_asym_wtd")
distance_powers = c(1, 3)
states = c("nsw", "vic", "qld", "wa", "sa", "tas", "act", "nt")
png_print = function(out_file, img_width, img_height, ggplot_img) {
    # Render a plot object to a PNG file.
    #
    # out_file: output filename; img_width/img_height: size in pixels;
    # ggplot_img: the plot to print. Returns TRUE.
    png(filename=out_file, width=img_width, height=img_height, units="px")
    # Close the graphics device even if print() errors, so a failed
    # plot can't leave a dangling open device:
    on.exit(dev.off(), add=TRUE)
    print(ggplot_img)
    return(TRUE)
}
make_plot = function(x, group_names, HTV, htv_colours, title_str, x_lim=NULL, y_lim=NULL) {
# The main plotting routine: scatter of group locations with
# repelled text labels, coloured by HTV-card membership.
# x: num_groups x 2 matrix of coordinates; x_lim/y_lim: optional
# fixed axis limits (NULL lets ggplot choose).
plot_data = data.frame(x = x[, 1], y = x[, 2], names = group_names, HTV = HTV)
p = ggplot()
p = p + geom_point(data = plot_data, aes(x = x, y = y, colour = factor(HTV)))
p = p + geom_text_repel(data = plot_data,
aes(x = x, y = y, label = names, colour = factor(HTV)),
size = 6, show.legend = FALSE)
p = p + scale_x_continuous(limits = x_lim)
p = p + scale_y_continuous(limits = y_lim)
p = p + coord_fixed(ratio = 1)
p = p + scale_color_manual(values = htv_colours, name = "HTV cards")
p = p + theme_grey(base_size = 16)
p = p + guides(colour = guide_legend(override.aes = list(size = 4)))
p = p + theme(legend.text = element_text(size = 16))
p = p + ggtitle(title_str)
return(p)
}
# Colour scheme for the HTV-card classification:
htv_colours = c(ALP = "#FF0000", Lib = "#0000FF", Both = "#800080", Neither = "#000000")
# The big nested loops: every combination of distance power,
# blank-vote definition, and state.
for (distance_power in distance_powers) {
for (blank_type in blank_types) {
# One folder of per-trial output per (blank_type, power) pair:
output_folder = sprintf("images/force_directed/trials/%s_%d", blank_type, distance_power)
dir.create(output_folder, showWarnings = FALSE)
for (state in states) {
print(sprintf("***** Starting %s %s %d *****", state, blank_type, distance_power))
# Defines state_title, lib_abbrev, alp_htv, lib_htv, phon_rua:
source("htv_abbrevs.R")
if (state %in% c("act", "nt")) {
# Only highlight the first three HTV parties in the territories.
alp_htv = alp_htv[1:3]
lib_htv = lib_htv[1:3]
}
# Distance matrix CSV produced by the perl script:
in_file = sprintf("graph_distances/distances_max_%s_blank%s.csv", state, blank_type)
distances.df = read_csv(in_file)
group_names = names(distances.df)
num_groups = length(group_names)
# Embed the groups in two dimensions:
num_dims = 2
# Indices of the parties used to orient the axes:
ALP_i = which(group_names == "ALP")
LNP_i = which(group_names == lib_abbrev)
PHON_i = which(group_names == phon_rua)
# Classify each group by which HTV card(s) it appears on:
ALP_HTV = character(num_groups)
Lib_HTV = character(num_groups)
ALP_HTV[which(group_names %in% alp_htv)] = "ALP"
Lib_HTV[which(group_names %in% lib_htv)] = "Lib"
HTV = sprintf("%s%s", ALP_HTV, Lib_HTV)
HTV = gsub("ALPLib", "Both", HTV)
HTV[which(HTV == "")] = "Neither"
title_str = sprintf("Force-directed, %s,\n%s, d^%d", state_title, blank_type, distance_power)
# Here we go. Normalise the distance matrix so that its
# highest element is 1, and raise all values to the
# current power:
dist_matrix = as.matrix(distances.df)
dist_matrix = dist_matrix / max(dist_matrix)
dist_matrix = dist_matrix^distance_power
# Initialise a matrix to store the locations of each
# trial, and of the mean values:
x = matrix(0, nrow=num_groups, ncol = num_dims)
mean_x = 0*x
# Spring constant:
attraction_const = 1
# Increment value (step size for the position updates):
inc = 5e-2
# Tolerance to decide if it has converged:
tol = 1e-6
# Iteration cap; shouldn't need anywhere near a million:
max_iter = 1000000
# The final co-ordinates will be stored in list_x,
# for retrieval at the end of the trials set:
list_x = list()
# Axis extents across all trials; first row min, second row max:
axis_bounds = matrix(0, nrow=2, ncol=num_dims)
# Here we go.
for (i_trial in 1:num_trials) {
print(sprintf("Starting trial %d", i_trial))
# Randomise the start locations (uniform in [0, 1]):
for (i in 1:num_dims) {
x[ , i] = runif(num_groups)
}
if (plot_intermediate_conv) {
# Snapshot of the random starting arrangement:
locations.df = data.frame(x = x[, 1], y=x[, 2], names=group_names)
this_plot = ggplot() +
geom_text(data=locations.df, aes(x=x, y=y, label=names)) +
ggtitle("Initial")
png_print("images/force_directed/convergence/0000.png", 500, 500, this_plot)
}
# Relax the spring system until positions stop changing:
for (i_iter in 1:max_iter) {
if (i_iter %% 10000 == 0) {
print(i_iter)
}
prev_x = x
# r will store the vectors between party locations,
# one list entry for each vector component, each
# list entry a matrix (i, j).
r = list()
length(r) = num_dims
for (i in 1:num_dims) {
r[[i]] = outer(x[ , i], rep(1, num_groups)) - outer(rep(1, num_groups), x[ , i])
# current_dist is a matrix soon to hold the Euclidean
# distance between each pair of points:
if (i == 1) {
current_dist = r[[i]]*r[[i]]
} else {
current_dist = current_dist + r[[i]]*r[[i]]
}
}
current_dist = sqrt(current_dist)
# Which direction are the springs forcing in?
# (+1 where a pair is further apart than its target
# distance, -1 where closer.)
spring_signs = 2*(current_dist > dist_matrix) - 1
# Calculate forces and increment locations. Force magnitude
# is proportional to the deviation from the target
# separation; positions are incremented directly, with no
# momentum -- not real physics.
for (i in 1:num_dims) {
F_springs_i = -attraction_const * spring_signs * (abs(current_dist - dist_matrix)) * r[[i]]
F_i = rowSums(F_springs_i)
x[ , i] = x[ , i] + inc*F_i
}
if (plot_intermediate_conv) {
out_file = sprintf("images/force_directed/convergence/%04d.png", i_iter)
locations.df = data.frame(x = x[, 1], y=x[, 2], names=group_names)
this_plot = ggplot() +
geom_text(data=locations.df, aes(x=x, y=y, label=names)) +
ggtitle(i_iter)
png_print(out_file, 500, 500, this_plot)
}
# NOTE(review): this is abs(max(...)), not max(abs(...)) --
# a large *negative* position change would not prevent
# "convergence" here; max(abs(x - prev_x)) looks intended.
# Confirm before changing, since it alters stopping times.
if (abs(max(x - prev_x)) < tol) {
# Convergence!
break
}
}
# Translate and rotate so ALP and the main Coalition party
# sit on the x-axis, equidistant from the origin:
ALP_pos = as.vector(x[ALP_i, ])
LNP_pos = as.vector(x[LNP_i, ])
mid_x = 0.5*(ALP_pos[1] + LNP_pos[1])
mid_y = 0.5*(ALP_pos[2] + LNP_pos[2])
delta_x = LNP_pos[1] - ALP_pos[1]
delta_y = LNP_pos[2] - ALP_pos[2]
x[ , 1] = x[ , 1] - mid_x
x[ , 2] = x[ , 2] - mid_y
theta = atan2(delta_y, delta_x)
rot_matrix = matrix(0, nrow=2, ncol=2)
rot_matrix[1, ] = c(cos(theta), sin(theta))
rot_matrix[2, ] = c(-sin(theta), cos(theta))
x = t(rot_matrix %*% t(x))
# Flip secondary axis if necessary, putting One Nation
# (or RUA in the territories) in the upper half-plane:
if (x[PHON_i, 2] < 0) {
x[ , 2] = -x[ , 2]
}
if (i_trial == 1) {
# mean_x is the running *sum* of trial locations; it's
# divided by num_trials after the last trial.
mean_x = x
} else {
# The PHON_i lines above are supposed to ensure that
# One Nation appears in the upper half-plane, but maybe
# it's close to y=0 and it ended up in the lower half-plane.
# So, check whether the current arrangement or its mirror
# image agrees better with the running sum of earlier
# trials, and keep whichever is closer.
test_x = x
test_x[ , 2] = -test_x[ , 2]
diff_1 = sum(abs(mean_x - x))
diff_2 = sum(abs(mean_x - test_x))
if (diff_2 < diff_1) {
x = test_x
}
mean_x = mean_x + x
}
# Write/print to file:
out_file_png = sprintf("%s/%s_%04d.png", output_folder, state, i_trial)
out_file_csv = sprintf("%s/%s_%04d.csv", output_folder, state, i_trial)
if (plot_intermediate) {
scatter_plot = make_plot(x, group_names, HTV, htv_colours, title_str)
png_print(out_file_png, 600, 500, scatter_plot)
}
# Store the final co-ordinates, update the overall
# axis bounds if necessary.
list_x[[i_trial]] = x
if (i_trial == 1) {
for (i in 1:num_dims) {
axis_bounds[1, i] = min(x[ , i])
axis_bounds[2, i] = max(x[ , i])
}
} else {
for (i in 1:num_dims) {
this_min = min(x[, i])
this_max = max(x[ ,i])
if (this_min < axis_bounds[1, i]) {
axis_bounds[1, i] = this_min
}
if (this_max > axis_bounds[2, i]) {
axis_bounds[2, i] = this_max
}
}
}
write(t(x), file = out_file_csv, ncolumns = num_dims, sep = ",")
}
# Re-print the intermediate plots, now with fixed axis extents
# common to all trials of this parameter set:
x_bounds = c(axis_bounds[1, 1], axis_bounds[2, 1])
y_bounds = c(axis_bounds[1, 2], axis_bounds[2, 2])
for (i_trial in 1:num_trials) {
x = list_x[[i_trial]]
out_file = sprintf("%s/%s_%04d.png", output_folder, state, i_trial)
scatter_plot = make_plot(x, group_names, HTV, htv_colours, title_str, x_bounds, y_bounds)
png_print(out_file, 600, 500, scatter_plot)
}
# And now the final plot: the average location over all trials.
mean_x = mean_x / num_trials
out_file = sprintf("images/force_directed/%s_%s_%d.png", state, blank_type, distance_power)
final_plot = make_plot(mean_x, group_names, HTV, htv_colours, title_str)
png_print(out_file, 600, 500, final_plot)
}
}
}
The R code for the force-directed graphs where the axes are optimised sequentially is similar to the above, and is in the download file.
Posted 2016-08-25.