#!/usr/bin/perl # tap2text.pl: a program for converting ZX Spectrum programs in .TAP files # into plain text. Reads from STDIN and to STDOUT. # Round-trip convertibility is intended. # Note that this version does not work with TZX input. # Version 1.0c # 2005-06-27 flag checking with case # 2009-03-28 minor corrections: corrected error in array definition, changed £ and © from chr(xxx) to string literal. # 2012-06-07 changed the way it reads data at start. # This program does not, and is not intended to, # validate ZX Spectrum BASIC code in any way, # except for rudimentary internal consistency checking, e.g. # whether stated program line lengths tally with actual line lengths. # If you downloaded this program from the Internet, make sure the extension # is .pl before running it. # © 2005 Alex Macfie, alex@cgce.net, http://www.cgce.net/ # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License http://www.fsf.org/licensing/licenses/gpl.txt # for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # Or visit http://www.fsf.org/licensing/licenses/gpl.txt # This program runs on the command line. # It reads from standard input and writes to standard output. # Use < > and | to change this behaviour. # Treatment of printable characters with non-standard assignments: # pound (£, code 96) and copyright (©, code 127) are rendered as such, unless -ascii flag is set; # up-arrow (code 94) is rendered as caret (^) by default, or as unicode up-arrow. # The backtick (`) is used as the escape character, # since it is the only ASCII printable character that does not # appear in the ZX Spectrum character set. # Escape sequences are *enclosed* in backticks like quotes. # Control characters are turned into `cn`, where n = decimal character code. # However, these should not normally appear in a program. # Colour control sequences are rendered as `INK1`, `PAPER5` etc. # Block graphics are interpreted as ASCII-art escape sequences (default) or as Unicode characters or escape sequences. # User-defined graphics are escaped as `gA`, where A is the UDG letter. # Tokens are expanded with spacing as in a real ZX Spectrum program. # Flags # -ascii: only ASCII characters are used. £ => `pound`, © => `copyright` # -nocolorcodes: discard color controls # -nocontrolcodes: discard control characters # -128: use 128 Spectrum character set with SPECTRUM and PLAY # -arrow: Use Unicode up-arrow instead of caret for character code 94 # -blockAscii: Interpret block graphics as AsciiArt escape sequences (default) # -blockUnicode: Interpret block graphics as the appropriate Unicode characters # -blockUnicodeEscape: Interpret block graphics the appropriate Unicode characters sequences, in the form `u00A0` use Switch; # errors $ERROR_BADARG = 65; $ERROR_INVALID_PROGRAM = 66; # flags @flags = @ARGV; $flag_ascii = 0; $flag_color = $flag_control = 1; $flag_128 = 0; $flag_blockGraphicAscii = 1; $flag_blockGraphicUnicode = 0; $flag_blockGraphicUnicodeEscape = 0; $flag_arrow = 0; $i = 0; foreach( @flags ) { switch($flags[$i]) { case '-ascii' { $flag_ascii = 1; } case '-nocolorcodes' { $flag_color = 0; } case '-nocontrolcodes' { $flag_control = 0; } case '-128' { $flag_128 = 1; } case '-arrow' { $flag_arrow = 1; } case '-blockAscii' { # block graphics as AsciiArt escapes $flag_blockGraphicAscii = 1; $flag_blockGraphicUnicode = 0; $flag_blockGraphicUnicodeEscape = 0; } case '-blockUnicode' { # block graphics as Unicode characters $flag_blockGraphicAscii = 0; $flag_blockGraphicUnicode = 1; $flag_blockGraphicUnicodeEscape = 0; } case '-blockUnicodeEscape' { # block graphics as Unicode escapes $flag_blockGraphicAscii = 0; $flag_blockGraphicUnicode = 0; $flag_blockGraphicUnicodeEscape = 1; } else { print STDERR "Usage: $0 [-ascii] [-nocolorcodes] [-nocontrolcodes] [-128] [-arrow] [-blockAscii] [-blockUnicode] [-blockUnicodeEscape]"; exit $ERROR_BADARG; } } } # constants #last nontoken character varies depending on whether program is in 48K or 128K mode $LAST_NONTOKEN_CHARACTER = ($flag_128?162:164); # characters for pound and copyright symbols in output character set $POUND = "£"; $COPYRIGHT = "©"; $PROGRAM = 0; $BYTES = 1; $ARRAY = 2; %colourControls = ( 16 => "INK", 17 => "PAPER", 18 => "BRIGHT", 19 => "FLASH", 20 => "INVERSE", 21 => "OVER" # included for completeness but can never actually appear ); %blockGraphics = ( 128 => "[ ]", 129 => "[ ']", 130 => "[' ]", 131 => "['']", 132 => "[ .]", 133 => "[ :]", 134 => "['.]", 135 => "[':]", 136 => "[. ]", 137 => "[.']", 138 => "[: ]", 139 => "[:']", 140 => "[..]", 141 => "[.:]", 142 => "[:.]", 143 => "[::]" ); %blockGraphicsUnicodeEscape = ( 128 => "`u00A0`", # alternate space char in unicode, to ensure round-trip convertibility 129 => "`u259D`", 130 => "`u2598`", 131 => "`u2580`", 132 => "`u2597`", 133 => "`u25A0`", 134 => "`u259A`", 135 => "`u259C`", 136 => "`u2596`", 137 => "`u259E`", 138 => "`u258D`", 139 => "`u259B`", 140 => "`u2584`", 141 => "`u259F`", 142 => "`u2599`", 143 => "`u2588`" ); %blockGraphicsUnicode = ( 128 => "\xA0", 129 => '▝', 130 => '▘', 131 => '▀', 132 => '▗', 133 => '▐', 134 => '▚', 135 => '▜', 136 => '▖', 137 => '▞', 138 => '▌', 139 => '▛', 140 => '▄', 141 => '▟', 142 => '▙', 143 => '█' ); $programPosition = $linePosition = $programLine = 0; $discard = $numberPosition = $linePosition = $inNumber = $inColor = $lineLength = 0; $fileLine = 0; $inProgram = 0; $program = ""; $inProgramHeader = $inProgramName = $programHeaderCounter = $programNameCounter = 0; $programName = ""; # Get the program name, put it on a single string. Newlines in the input string have no relevance to # newlines in a ZX Spectrum program, hence the chaining of "lines" in the input text. while ($fileLine = ) { $program .= $fileLine; } # if( substr($program, 0, 4) ne "\x13\x00\x00\x00") { # print STDERR "Error: Not a valid ZX Spectrum program file.\n"; # exit $ERROR_INVALID_PROGRAM; # } # print STDOUT "Program: " . substr($program, 4, 10); # program name # print "\n"; # $program = substr($program, 24); # truncate header $lineNumber = $lastLineNumber = 0; $line = ""; $length = length($program); for ($i=0; $i<$length; ++$i) { $lastChar = $char; $char = substr($program, $programPosition++, 1); $asciiValue = ord($char); $lastAsciiValue = ord($lastChar); $lastLineNumber = $lineNumber; if( $inProgram ) { # First two bytes represent program line if ($linePosition == 0) { if( $asciiValue > 39 ) { $inProgram = 0 } #Line number > 9999 => not a valid program line else { $lineNumber = 256*$asciiValue } # first (most significant) byte of line number } elsif ($linePosition == 1) { $lineNumber += $asciiValue; # second (least significant) byte of line number if($lineNumber <= 9999) {$line .= $lineNumber . "\t";} else { # not in a program: go back to previous symbol in stream $inProgram = 0; $programPosition--; } } elsif ($linePosition == 3) { $lineLength += 256*$asciiValue # first (least significant) byte of line length } elsif ($linePosition == 2) { $lineLength = $asciiValue; # second (most significant) byte of line length } elsif ($numberPosition == 4) { # end of a number $inNumber = $numberPosition = 0; } elsif ($inNumber) { $numberPosition++; } elsif ($inColour) { $inColour = 0; $line .= $asciiValue."`"; } elsif( $asciiValue >= 16 && $asciiValue <= 21 && $flag_color ) { # Colour control $inColour = 1; $line .= "`col".$colourControls{$asciiValue}; } elsif ($asciiValue == 14) { # start of a number $inNumber = 1; } elsif ($asciiValue == 13) { # newline $programLine++; $line .= "\n"; if($linePosition != $lineLength+3) { # Check that line length is correct; if not then not a valid program line. # May contain a program header; this begins with the sequence 13h 00h 00h 00h followed by program name. # $discard = 1; print STDERR "Warning: error in program line\n" } if (!($discard)) { print STDOUT $line; } $line = ""; $discard = 0; $linePosition = -1; } elsif ($asciiValue < 32) { # non-printable control character $line .= "`c$asciiValue`" if $flag_control; } elsif ($asciiValue > $LAST_NONTOKEN_CHARACTER) { # is a token $line .= expand($asciiValue, $lastAsciiValue); } elsif ($asciiValue == 127) { # copyright symbol in ZX Spectrum charset if( $flag_ascii) { $line .= "`copyright`"; } else { $line .= $COPYRIGHT; } } elsif ($asciiValue == 96) { # pound symbol in ZX Spectrum charset if( $flag_ascii) { $line .= "`pound`"; # double because the backtick is used as an escape char } else { $line .= $POUND; } } elsif ($asciiValue == ord('^') && $flag_arrow) { # up arrow for caret (as in ZX Specturm charset $line .= '↑'; } elsif ($asciiValue >= 128 && $asciiValue <= 143) { # Block graphics $$line .= "`b".$blockGraphics{$asciiValue}."`" if $flag_blockGraphicAscii; $$line .= $blockGraphicsUnicode{$asciiValue} if $flag_blockGraphicUnicode; $$line .= $blockGraphicsUnicodeEscape{$asciiValue} if $flag_blockGraphicUnicode; } elsif ($asciiValue >= 144 && $asciiValue < $LAST_NONTOKEN_CHARACTER) { # UDGs # A UDG appears as `gA`, where A is the corresponding letter $line .= "`g".chr($asciiValue-79)."`"; } else { # other ZX Spectrum printable character. Not necessarily a printable character in the destination charset. $line .= $char; } $linePosition++; } else { # not in program # look for next program header if($asciiValue == 19) { # this may mark next header $inProgramHeader = 1; } elsif( $inProgramHeader ) { if( ++$programHeaderCounter < 3 && $asciiValue != 0) { $inProgramHeader = $programHeaderCounter = 0 # false alarm, must have been something else } elsif( $programHeaderCounter == 3 ) { # start of program name $inProgramName = 1; } elsif( $inProgramName ) { if( ++$programNameCounter <= 10 ) { $programName .= $char; } elsif ( $programNameCounter == 11 ) { $inProgramName = 0; $fileType = "Program"; $fileType = "Bytes" if( $asciiValue == 168 ); $fileType = "Array" if( $asciiValue == 44 ); } } elsif ($programHeaderCounter == 23 ) { print STDOUT "\n$fileType: $programName\n"; $inProgram = 1 if $fileType eq "Program"; $linePosition = 0; # Reset counters and name $inProgramHeader = $inProgramName = $programHeaderCounter = $programNameCounter = 0; $programName = ""; } } } } sub expand { my ($code, $lastCode) = @_; my $spaceBefore, $spaceAfter; # Array holding tokens. my @tokens = ( 'SPECTRUM', 'PLAY', 'RND', 'INKEY$', 'PI','FN', 'POINT', 'SCREEN$', 'ATTR', #163-171: 128-specific tokens, random, keyboard and screen manipulation functions 'AT', 'TAB', 'VAL$', 'CODE', 'VAL', 'LEN', 'SIN', 'COS', 'TAN', 'ASN', 'ACS', 'ATN', #172-183: PRINT position control tokens, trig functions 'LN', 'EXP', 'INT', 'SQR', 'SGN', 'ABS', 'PEEK', 'IN', 'USR', 'STR$', 'CHR$', #184-194: math, system and string functions 'NOT', 'BIN', 'OR', 'AND', '<=', '>=', '<>', 'LINE', 'THEN', 'TO', 'STEP', #195-205: boolean and comparison operators, control flow tokens 'DEF FN', 'CAT', 'FORMAT', 'MOVE', 'ERASE', 'OPEN #', 'CLOSE #', 'MERGE', 'VERIFY', #206-214: U-D function, file and stream manipulation statements 'BEEP', 'CIRCLE', 'INK', 'PAPER', 'FLASH', 'BRIGHT', 'INVERSE', 'OVER', #215-222: sound, graphics and colour manipulation statements 'OUT', 'LPRINT', 'LLIST', 'STOP', 'READ', 'DATA', 'RESTORE', #223-229: port, printer, control flow and data statements 'NEW', 'BORDER', 'CONTINUE', 'DIM', 'REM', 'FOR', 'GO TO', 'GO SUB', 'INPUT', #230-238: K-cursor statements on keys A-I 'LOAD', 'LIST', 'LET', 'PAUSE', 'NEXT', 'POKE', 'PRINT', 'PLOT', 'RUN', #239-247: K-cursor statements on keys J-R 'SAVE', 'RANDOMIZE', 'IF', 'CLS', 'CLEAR', 'RETURN', 'COPY' #249-255: K-cursor statements on keys S-Z ); # Determine spacing of tokens. # Statements, Boolean binary operators (OR, AND) and control flow tokens (LINE, THEN, TO, STEP) have a space before. sub hasSpaceBefore { my $Ccode = shift; return ($Ccode == 163 || $Ccode == 164 || $Ccode == 197 || $Ccode == 198 || $Ccode>=202); } # All but RND, INKEY$, PI, <=, >=, <>, OPEN #, CLOSE# have a space after. sub hasSpaceAfter { my $Ccode = shift; return ($Ccode == 163 || $Ccode == 164 || $Ccode>=168 && $Ccode<=198 || $Ccode>=202 && $Ccode != 211 && $Ccode != 212); } # No space before a statement if previous token has a space after it. $spaceBefore = hasSpaceBefore($code) && !(hasSpaceAfter($lastCode)); $spaceAfter = hasSpaceAfter($code); return "" if( $code <= 163); # return empty string if code doesn't correspond to a token return ($spaceBefore?" ":"") . $tokens[$code-163] . ($spaceAfter?" ":""); }