Making Synthesia-style videos in Ubuntu

Synthesia is a game for piano, and its Guitar-Hero-style falling notes have become a popular alternative to sheet music in the YouTube era, with plenty of tutorial videos using this format (for example).

My piano teacher thinks that making these videos myself is useful – I can see, for example, what chord patterns I'm playing, and how they compare with what the better pianists on YouTube do. Hidden at the bottom of the Synthesia download page is an app that, I am told, converts MIDI files to video. But it's only available for 32-bit Vista and macOS.

So, this leaves me, an Ubuntu user, with two tasks: recording a MIDI file, then converting the MIDI to video.

Recording a MIDI file in Ubuntu

My digital piano is a Casio CDP-120, which has a USB port that I can connect to my laptop. I see people on the Internet writing that they successfully got their keyboards talking to various software packages (Ardour is a popular choice), but I had no luck with that and resorted to the command line.

First, run arecordmidi -l to see if it recognises the keyboard, and, if so, which port it is connected to:

 Port    Client name                      Port name
 14:0    Midi Through                     Midi Through Port-0
 20:0    CASIO USB-MIDI                   CASIO USB-MIDI MIDI 1

My keyboard is connected to port 20. To record, run

arecordmidi --port 20 output_file.mid

It will keep recording until you press Ctrl-C to kill the process. I have a(n occasionally inelegant) shell script that automates finding the port and choosing a file name (audio001.mid, audio002.mid, ...), and which I run in a terminal.

#!/bin/bash

# Assumes that the input MIDI device has a name containing 'CASIO';
# edit the second-last line as appropriate, or set the port directly
# in the final line.


LIST=`exec ls audio*.mid | sed 's/audio\([0-9]\+\).*/\1/g' | tr '\n' ' '`

if [ -z "$LIST" ]
then
	next_file_03d="audio001.mid"
else
	arr=($LIST)
	
	last_file_i=$((10#${#arr[@]}-1))
	last_file=${arr[$last_file_i]}
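	# The 10# prefix forces base-10 arithmetic, so that e.g. 009 isn't read as octal.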
	next_file=$((10#$last_file+1))
	
	next_file_03d=`printf audio%03d.mid $((next_file))`
fi

PORT=`arecordmidi -l | grep CASIO | sed 's/\:.*//' | sed 's/ //'`

arecordmidi --port "$PORT" $next_file_03d

Converting a MIDI file to video

I wrote a Python script to parse the MIDI file, which I assume has only one track (the piano). It uses the mido library to import the MIDI file, and Pillow to write the individual frames of the video out as PNGs. The end of the script makes two calls to external programs: one to Timidity (sudo apt-get install timidity), which converts the MIDI file to WAV, and one to FFmpeg, which puts everything together into a video.
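(mido and Pillow, along with NumPy, can be installed with pip if you don't already have them, e.g. pip install mido Pillow numpy.)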

It took a few minutes to generate this video, a little over a minute long:

I am better at the scripting than at the piano.

import mido
import PIL.Image
import numpy as np
import os
import subprocess

input_midi = "haunted.mid"
frames_folder = "single_frames"

image_width  = 1280
image_height = 720
piano_height = round(image_height/6)
black_key_height = 2/3
pressed_key_colour = [200, 128, 128]

# Speed in main-image-heights per second
vertical_speed = 1/4
fps = 25

main_height = image_height - piano_height
time_per_pixel = 1/(main_height*vertical_speed)
pixels_per_frame = main_height*vertical_speed / fps
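# With the settings above, the notes fall at 150 pixels per second
# (6 pixels per frame), so a note takes 4 seconds to travel from the
# top of the frame down to the keyboard.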

# Only used in the print-out of the notes; not relevant to the video:
accidentals = "flat"

white_notes = {0: "C", 2: "D", 4: "E", 5: "F", 7: "G", 9: "A", 11: "B"}
sharp_notes = {1: "C#", 3: "D#", 6: "F#", 8: "G#", 10: "A#"}
flat_notes  = {1: "Db", 3: "Eb", 6: "Gb", 8: "Ab", 10: "Bb"}

white_notes_scale = {0: 0, 2: 1, 4: 2, 5: 3, 7: 4, 9: 5, 11: 6}

note_names = {}

def note_breakdown(midi_note):
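  # Split a MIDI note number into [pitch class, octave];
  # e.g. note_breakdown(60) gives [0, 4], i.e. middle C (C4).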
  note_in_chromatic_scale = midi_note % 12
  octave = round((midi_note - note_in_chromatic_scale) / 12 - 1)
  
  return [note_in_chromatic_scale, octave]


for note in range(21, 109):
  [note_in_chromatic_scale, octave] = note_breakdown(note)
  
  if note_in_chromatic_scale in white_notes:
    note_names[note] = "{}{:d}".format(
      white_notes[note_in_chromatic_scale], octave)
  else:
    if accidentals == "flat":
      note_names[note] = "{}{:d}".format(
        flat_notes[note_in_chromatic_scale], octave)
    else:
      note_names[note] = "{}{:d}".format(
        sharp_notes[note_in_chromatic_scale], octave)


def is_white_key(note):
  return (note % 12) in white_notes

input_file = mido.MidiFile(input_midi)
track = input_file.tracks[0]
ticks_per_beat = input_file.ticks_per_beat

# The 'notes' list will store each note played, with start and end
# times in seconds.
notes = []
notes_on  = 0

# The MIDI file comprises a number of messages.  The time given in
# a message is the time since the previous message, and is in units
# of ticks.
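# Dividing by 2*ticks_per_beat below converts ticks to seconds, assuming
# the default tempo of 120 beats per minute that arecordmidi uses (half a
# second per beat); any set_tempo messages in the file are ignored.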
current_t = 0

for msg in track:
  if msg.type == "note_on":
    notes.append({"note": msg.note,
                  "start": (current_t + msg.time)/(2*ticks_per_beat),
                  "end": 0})
    notes_on += 1
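  # (This assumes the keyboard sends explicit note_off messages; some
  # instruments send note_on with velocity 0 instead, which would need
  # to be treated as a note_off here.)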
  elif msg.type == "note_off":
    # Loop backwards to find which note just ended:
    for i in range(notes_on - 1, -1, -1):
      if notes[i]["note"] == msg.note:
        notes[i]["end"] = (current_t + msg.time)/(2*ticks_per_beat)
        break
  
  current_t += msg.time

# Print-out of the notes, to check that the file has been parsed
# correctly:
for note in notes:
  print("Note = {}, start = {:.2f}, duration = {:.2f}".format(
    note_names[note["note"]],
    note["start"],
    note["end"] - note["start"]))


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ The rest of the code is about making the video. ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def pixel_range(midi_note, image_width):
  # Returns the min and max x-values for a piano key, in pixels.
  
  width_per_white_key = image_width / 52
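  # An 88-key piano has 52 white keys, from A0 (MIDI 21) to C8 (MIDI 108).
  # white_note_n below numbers them 0-51; the -5 accounts for the five
  # white keys (C0-G0) below A0 that don't exist on an 88-key keyboard.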
  
  if is_white_key(midi_note):
    [in_scale, octave] = note_breakdown(midi_note)
    offset = 0
    width = 1
  else:
    [in_scale, octave] = note_breakdown(midi_note - 1)
    offset = 0.5
    width = 0.5
  
  white_note_n = white_notes_scale[in_scale] + 7*octave - 5
  
  start_pixel = round(width_per_white_key*(white_note_n + offset)) + 1
  end_pixel   = round(width_per_white_key*(white_note_n + 1 + offset)) - 1
  
  if width != 1:
    mid_pixel = round(0.5*(start_pixel + end_pixel))
    half_pixel_width = 0.5*width_per_white_key
    half_pixel_width *= width
    
    start_pixel = round(mid_pixel - half_pixel_width)
    end_pixel   = round(mid_pixel + half_pixel_width)
  
  return [start_pixel, end_pixel]
  
  
if not os.path.isdir(frames_folder):
  os.mkdir(frames_folder)
  
# Delete all previous image frames:
for f in os.listdir(frames_folder):
  os.remove("{}/{}".format(frames_folder, f))

im_base = np.zeros((image_height, image_width, 3), dtype=np.uint8)

# Draw the piano, and the grey lines next to the C's for the main area:

key_start = image_height - piano_height
white_key_end = image_height - 1
black_key_end = round(image_height - (1-black_key_height)*piano_height)

im_lines = im_base.copy()

for i in range(21, 109):
  if is_white_key(i):
    [x0, x1] = pixel_range(i, image_width)
    im_base[key_start:white_key_end, x0:x1] = [255, 255, 255]
  
  if i % 12 == 0:
    # C
    im_lines[0:(key_start-1), (x0-2):(x0-1)] = [80, 80, 80]

for i in range(21, 109):
  if not is_white_key(i):
    [x0, x1] = pixel_range(i, image_width)
    im_base[key_start:black_key_end, x0:x1] = [0, 0, 0]

im_piano = im_base[key_start:white_key_end, :]

im_frame = im_base.copy()
im_frame += im_lines

# Timidity (the old version that I have!) always starts the audio
# at time = 0.  Add a second of silence to the start, and also
# keep making frames for a second at the end:
frame_start = notes[0]["start"] - 1
end_t = max(note["end"] for note in notes) + 1

# First frame:
for j in range(main_height):
  im_j = main_height - j - 1
  t = frame_start + time_per_pixel*j
  for note in notes:
    if note["start"] <= t <= note["end"]:
      [x0, x1] = pixel_range(note["note"], image_width)
      im_frame[im_j, x0:x1] = [255, 0, 0]

img = PIL.Image.fromarray(im_frame)
img.save("{}/frame00000.png".format(frames_folder))


# Rest of video:
finished = False
frame_ct = 0
pixel_start = 0
pixel_start_rounded = 0
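# pixels_per_frame isn't necessarily a whole number, so keep the exact
# scroll position in pixel_start and shift the image down by the rounded
# difference each frame.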

print("Starting images")

while not finished:
  frame_ct += 1
  if frame_ct % 100 == 0:
    print(frame_ct)
  
  prev_pixel_start_rounded = pixel_start_rounded
  pixel_start += pixels_per_frame
  pixel_start_rounded = round(pixel_start)
  
  pixel_increment = pixel_start_rounded - prev_pixel_start_rounded
  
  frame_start += 1/fps
  
  # Copy most of the previous frame into the new frame:
  im_frame[pixel_increment:main_height, :] = im_frame[0:(main_height - pixel_increment), :]
  im_frame[0:pixel_increment, :] = im_lines[0:pixel_increment, :]
  im_frame[key_start:white_key_end, :] = im_piano
  
  # Which keys need to be coloured?
  keys_to_colour = []
  for note in notes:
    if note["start"] <= frame_start <= note["end"]:
      keys_to_colour.append(note["note"])
  
  # Draw the new pixels at the top of the frame:
  for j in range(pixel_increment):
    t = frame_start + time_per_pixel*(main_height - j - 1)
    
    for note in notes:
      if note["start"] <= t <= note["end"]:
        [x0, x1] = pixel_range(note["note"], image_width)
        im_frame[j, x0:x1] = [255, 0, 0]  
  
  # First colour the white keys (this will cover some black-key pixels),
  # then re-draw the black keys either side,
  # then colour the black keys.
  for note in keys_to_colour:
    if is_white_key(note):
      [x0, x1] = pixel_range(note, image_width)
      im_frame[key_start:white_key_end, x0:x1] = pressed_key_colour
  
  for note in keys_to_colour:
    if is_white_key(note):
      if (not is_white_key(note - 1)) and (note > 21):
        [x0, x1] = pixel_range(note - 1, image_width)
        im_frame[key_start:black_key_end, x0:x1] = [0,0,0]
      
      if (not is_white_key(note + 1)) and (note < 108):
        [x0, x1] = pixel_range(note + 1, image_width)
        im_frame[key_start:black_key_end, x0:x1] = [0,0,0]
  
  for note in keys_to_colour:
    if not is_white_key(note):
      [x0, x1] = pixel_range(note, image_width)
      im_frame[key_start:black_key_end, x0:x1] = pressed_key_colour
  
  
  img = PIL.Image.fromarray(im_frame)
  img.save("{}/frame{:05d}.png".format(frames_folder, frame_ct))
  
  if frame_start >= end_t:
    finished = True


print("Calling Timidity")
subprocess.call("timidity {} -Ow --output-24bit -A120 -o output.wav".format(input_midi).split())


print("Calling ffmpeg")
# Running from a terminal, the long filter_complex argument needs to
# be in double-quotes, but the list form of subprocess.call requires
# _not_ double-quoting.
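# In the filter, adelay=1000|1000 delays both audio channels by one
# second, to line up with the second of video added before the first note.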
ffmpeg_cmd = "ffmpeg -framerate 25 -i {}/frame%05d.png -i output.wav -f lavfi -t 0.1 -i anullsrc -filter_complex [1]adelay=1000|1000[aud];[2][aud]amix -c:v libx264 -vf fps=25 -pix_fmt yuv420p -y out.mp4".format(frames_folder)

subprocess.call(ffmpeg_cmd.split())

Posted 2019-03-10.

