From 18a4540e7a108949cb2fc4aa292a41ad22e29394 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 15 Jan 2018 10:19:30 +0700 Subject: [PATCH] Create data conversion infrastructure Remove data parsing from the original Catalogs() function and rename it to ParseHeader() to reflect its new, limited role of extracting the schema info from a single header. The new data files are handled by the new function ParseData(). Having these functions work with only one file at a time requires their callers to do more work, but results in a cleaner design. rewrite_dat.pl reads in pg_*.dat files and rewrites them in a standard format. It writes attributes in order, preserves comments and folds consecutive blank lines. The meta-attributes oid, oid_symbol and (sh)descr are on their own line, if present. --- src/backend/catalog/Catalog.pm | 227 ++++++++++++++++++------------------- src/include/catalog/rewrite_dat.pl | 200 ++++++++++++++++++++++++++++++++ 2 files changed, 311 insertions(+), 116 deletions(-) create mode 100644 src/include/catalog/rewrite_dat.pl diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm index 9ced154..60e641e 100644 --- a/src/backend/catalog/Catalog.pm +++ b/src/backend/catalog/Catalog.pm @@ -1,7 +1,7 @@ #---------------------------------------------------------------------- # # Catalog.pm -# Perl module that extracts info from catalog headers into Perl +# Perl module that extracts info from catalog files into Perl # data structures # # Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group @@ -16,12 +16,11 @@ package Catalog; use strict; use warnings; -# Call this function with an array of names of header files to parse. -# Returns a nested data structure describing the data in the headers. -sub Catalogs +# Parses a catalog header file into a data structure describing the schema +# of the catalog. +sub ParseHeader { - my (%catalogs, $catname, $declaring_attributes, $most_recent); - $catalogs{names} = []; + my $input_file = shift; # There are a few types which are given one name in the C source, but a # different name at the SQL level. These are enumerated here. @@ -34,19 +33,16 @@ sub Catalogs 'TransactionId' => 'xid', 'XLogRecPtr' => 'pg_lsn'); - foreach my $input_file (@_) - { my %catalog; + my $declaring_attributes = 0; my $is_varlen = 0; $catalog{columns} = []; - $catalog{data} = []; + $catalog{toasting} = []; + $catalog{indexing} = []; open(my $ifh, '<', $input_file) || die "$input_file: $!"; - my ($filename) = ($input_file =~ m/(\w+)\.h$/); - my $natts_pat = "Natts_$filename"; - # Scan the input file. while (<$ifh>) { @@ -64,9 +60,6 @@ sub Catalogs redo; } - # Remember input line number for later. - my $input_line_number = $.; - # Strip useless whitespace and trailing semicolons. chomp; s/^\s+//; @@ -74,68 +67,17 @@ sub Catalogs s/\s+/ /g; # Push the data into the appropriate data structure. 
-		if (/$natts_pat\s+(\d+)/)
-		{
-			$catalog{natts} = $1;
-		}
-		elsif (
-			/^DATA\(insert(\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/)
-		{
-			check_natts($filename, $catalog{natts}, $3, $input_file,
-						$input_line_number);
-
-			push @{ $catalog{data} }, { oid => $2, bki_values => $3 };
-		}
-		elsif (/^DESCR\(\"(.*)\"\)$/)
-		{
-			$most_recent = $catalog{data}->[-1];
-
-			# this tests if most recent line is not a DATA() statement
-			if (ref $most_recent ne 'HASH')
-			{
-				die "DESCR() does not apply to any catalog ($input_file)";
-			}
-			if (!defined $most_recent->{oid})
-			{
-				die "DESCR() does not apply to any oid ($input_file)";
-			}
-			elsif ($1 ne '')
-			{
-				$most_recent->{descr} = $1;
-			}
-		}
-		elsif (/^SHDESCR\(\"(.*)\"\)$/)
-		{
-			$most_recent = $catalog{data}->[-1];
-
-			# this tests if most recent line is not a DATA() statement
-			if (ref $most_recent ne 'HASH')
-			{
-				die
-				  "SHDESCR() does not apply to any catalog ($input_file)";
-			}
-			if (!defined $most_recent->{oid})
-			{
-				die "SHDESCR() does not apply to any oid ($input_file)";
-			}
-			elsif ($1 ne '')
-			{
-				$most_recent->{shdescr} = $1;
-			}
-		}
-		elsif (/^DECLARE_TOAST\(\s*(\w+),\s*(\d+),\s*(\d+)\)/)
+		if (/^DECLARE_TOAST\(\s*(\w+),\s*(\d+),\s*(\d+)\)/)
 		{
-			$catname = 'toasting';
 			my ($toast_name, $toast_oid, $index_oid) = ($1, $2, $3);
-			push @{ $catalog{data} },
+			push @{ $catalog{toasting} },
 			  "declare toast $toast_oid $index_oid on $toast_name\n";
 		}
 		elsif (/^DECLARE_(UNIQUE_)?INDEX\(\s*(\w+),\s*(\d+),\s*(.+)\)/)
 		{
-			$catname = 'indexing';
 			my ($is_unique, $index_name, $index_oid, $using) =
 			  ($1, $2, $3, $4);
-			push @{ $catalog{data} },
+			push @{ $catalog{indexing} },
 			  sprintf(
 				"declare %sindex %s %s %s\n",
 				$is_unique ? 'unique ' : '',
@@ -143,16 +85,13 @@ sub Catalogs
 		}
 		elsif (/^BUILD_INDICES/)
 		{
-			push @{ $catalog{data} }, "build indices\n";
+			push @{ $catalog{indexing} }, "build indices\n";
 		}
 		elsif (/^CATALOG\(([^,]*),(\d+)\)/)
 		{
-			$catname = $1;
+			$catalog{catname} = $1;
 			$catalog{relation_oid} = $2;
 
-			# Store pg_* catalog names in the same order we receive them
-			push @{ $catalogs{names} }, $catname;
-
 			$catalog{bootstrap} = /BKI_BOOTSTRAP/ ? ' bootstrap' : '';
 			$catalog{shared_relation} =
 			  /BKI_SHARED_RELATION/ ? ' shared_relation' : '';
@@ -173,7 +112,7 @@ sub Catalogs
 			}
 			if (/^}/)
 			{
-				undef $declaring_attributes;
+				$declaring_attributes = 0;
 			}
 			else
 			{
@@ -227,32 +166,107 @@ sub Catalogs
 				}
 			}
 		}
-		$catalogs{$catname} = \%catalog;
 		close $ifh;
-	}
-	return \%catalogs;
+	return \%catalog;
 }
 
-# Split a DATA line into fields.
-# Call this on the bki_values element of a DATA item returned by Catalogs();
-# it returns a list of field values.  We don't strip quoting from the fields.
-# Note: it should be safe to assign the result to a list of length equal to
-# the nominal number of catalog fields, because check_natts already checked
-# the number of fields.
-sub SplitDataLine
+# Parses a file containing Perl data structure literals, returning live data.
+#
+# The parameter $preserve_formatting needs to be set for callers that want
+# to work with non-data lines in the data files, such as comments and blank
+# lines.  If a caller just wants to consume the data, leave it unset.
+sub ParseData
 {
-	my $bki_values = shift;
-
-	# This handling of quoted strings might look too simplistic, but it
-	# matches what bootscanner.l does: that has no provision for quote marks
-	# inside quoted strings, either.  If we don't have a quoted string, just
-	# snarf everything till next whitespace.  That will accept some things
-	# that bootscanner.l will see as erroneous tokens; but it seems wiser
-	# to do that and let bootscanner.l complain than to silently drop
-	# non-whitespace characters.
-	my @result = $bki_values =~ /"[^"]*"|\S+/g;
-
-	return @result;
+	my ($input_file, $schema, $preserve_formatting) = @_;
+
+	$input_file =~ /\w+\.dat$/
+	  or die "Input file needs to be a .dat file.\n";
+	my $data = [];
+
+	# Read entire file into a string and eval it.
+	if (!$preserve_formatting)
+	{
+		my $file_string = do
+		{
+			local $/ = undef;
+			open my $ifd, "<", $input_file or die "$input_file: $!";
+			<$ifd>;
+		};
+
+		eval '$data = ' . $file_string;
+		print "Error: $@\n" if $@;
+		return $data;
+	}
+
+	# When preserving formatting, we scan the file one line at a time
+	# and decide how to handle each item.  We don't check too closely
+	# for valid syntax, since we assume it will be checked otherwise.
+	my $prev_blank = 0;
+	open(my $ifd, '<', $input_file) or die "$input_file: $!";
+	while (<$ifd>)
+	{
+		my $datum;
+
+		# Capture non-consecutive blank lines.
+		if (/^\s*$/)
+		{
+			next if $prev_blank;
+			$prev_blank = 1;
+
+			# Newline gets added by caller.
+			$datum = '';
+		}
+		else
+		{
+			$prev_blank = 0;
+		}
+
+		# Capture comments that are on their own line.
+		if (/^\s*(#.*?)\s*$/)
+		{
+			$datum = $1;
+		}
+
+		# Capture brackets that are on their own line.
+		elsif (/^\s*(\[|\])\s*$/)
+		{
+			$datum = $1;
+		}
+
+		# Capture hash references
+		# NB: Assumes that the next hash ref can't start on the
+		# same line where the present one ended.
+		# Not foolproof, but we shouldn't need a full parser,
+		# since we expect relatively well-behaved input.
+		elsif (/{/)
+		{
+			# Quick hack to detect when we have a full hash ref to
+			# parse.  We can't just use a regex because of values in
+			# pg_aggregate and pg_proc like '{0,0}'.
+			my $lcnt = tr/{//;
+			my $rcnt = tr/}//;
+
+			if ($lcnt == $rcnt)
+			{
+				eval '$datum = ' . $_;
+				if (!ref $datum)
+				{
+					die "Error parsing $_\n$@";
+				}
+			}
+			else
+			{
+				my $next_line = <$ifd>;
+				die "$input_file: ends within Perl hash\n"
+				  if !defined $next_line;
+				$_ .= $next_line;
+				redo;
+			}
+		}
+		push @$data, $datum if defined $datum;
+	}
+	close $ifd;
+	return $data;
 }
 
 # Fill in default values of a record using the given schema.  It's the
@@ -308,7 +322,6 @@ sub RenameTempFile
 	rename($temp_name, $final_name) || die "rename: $temp_name: $!";
 }
 
-
 # Find a symbol defined in a particular header file and extract the value.
 #
 # The include path has to be passed as a reference to an array.
@@ -340,22 +353,4 @@ sub FindDefinedSymbol
 	die "$catalog_header: not found in any include directory\n";
 }
 
-
-# verify the number of fields in the passed-in DATA line
-sub check_natts
-{
-	my ($catname, $natts, $bki_val, $file, $line) = @_;
-
-	die
-"Could not find definition for Natts_${catname} before start of DATA() in $file\n"
-	  unless defined $natts;
-
-	my $nfields = scalar(SplitDataLine($bki_val));
-
-	die sprintf
-"Wrong number of attributes in DATA() entry at %s:%d (expected %d but got %d)\n",
-	  $file, $line, $natts, $nfields
-	  unless $natts == $nfields;
-}
-
 1;
diff --git a/src/include/catalog/rewrite_dat.pl b/src/include/catalog/rewrite_dat.pl
new file mode 100644
index 0000000..e0d0d66
--- /dev/null
+++ b/src/include/catalog/rewrite_dat.pl
@@ -0,0 +1,200 @@
+#!/usr/bin/perl -w
+#----------------------------------------------------------------------
+#
+# rewrite_dat.pl
+#    Perl script that reads in a catalog data file and writes out
+#    a functionally equivalent file in a standard format.
+#
+# -Metadata entries are on their own line within the data entry.
+# -Attributes are in the same order they would be in the catalog table.
+#
+# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# /src/include/catalog/rewrite_dat.pl
+#
+#----------------------------------------------------------------------
+
+use Catalog;
+
+use strict;
+use warnings;
+
+my @input_files;
+my $output_path = '';
+
+# Process command line switches.
+while (@ARGV)
+{
+	my $arg = shift @ARGV;
+	if ($arg !~ /^-/)
+	{
+		push @input_files, $arg;
+	}
+	elsif ($arg =~ /^-o/)
+	{
+		$output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV;
+	}
+	elsif ($arg eq '--revert')
+	{
+		revert();
+	}
+	else
+	{
+		usage();
+	}
+}
+
+# Sanity check arguments.
+die "No input files.\n"
+  if !@input_files;
+
+# Make sure output_path ends in a slash.
+if ($output_path ne '' && substr($output_path, -1) ne '/')
+{
+	$output_path .= '/';
+}
+
+# Metadata of a catalog entry
+my @METADATA = ('oid', 'oid_symbol', 'descr', 'shdescr');
+
+# Read all the input files into internal data structures.
+# We pass data file names as arguments and then look for matching
+# headers to parse the schema from.
+foreach my $datfile (@input_files)
+{
+	$datfile =~ /(.+)\.dat$/
+	  or die "Input files need to be data (.dat) files.\n";
+
+	my $header = "$1.h";
+	die "There is no header file corresponding to $datfile"
+	  if ! -e $header;
+
+	my @attnames;
+	my $catalog = Catalog::ParseHeader($header);
+	my $catname = $catalog->{catname};
+	my $schema  = $catalog->{columns};
+
+	foreach my $column (@$schema)
+	{
+		my $attname = $column->{name};
+		push @attnames, $attname;
+	}
+
+	my $catalog_data = Catalog::ParseData($datfile, $schema, 1);
+	next if !defined $catalog_data;
+
+	# Back up old data file rather than overwrite it.  The input path and
+	# output path are normally the same, but we don't assume that.
+	my $newdatfile = "$output_path$catname.dat";
+	if (-e $newdatfile)
+	{
+		rename($newdatfile, $newdatfile . '.bak')
+		  or die "rename: $newdatfile: $!";
+	}
+	open my $dat, '>', $newdatfile
+	  or die "can't open $newdatfile: $!";
+
+	# Write the data.
+	foreach my $data (@$catalog_data)
+	{
+		# Either a newline, comment, or bracket - just write it out.
+		if (! ref $data)
+		{
+			print $dat "$data\n";
+		}
+		# Hash ref representing a data entry.
+		elsif (ref $data eq 'HASH')
+		{
+			my %values = %$data;
+			print $dat "{ ";
+
+			# Separate out metadata fields for readability.
+			my $metadata_line = format_line(\%values, @METADATA);
+			if ($metadata_line)
+			{
+				print $dat $metadata_line;
+				print $dat ",\n";
+			}
+			my $data_line = format_line(\%values, @attnames);
+
+			# Line up with metadata line, if there is one.
+			if ($metadata_line)
+			{
+				print $dat '  ';
+			}
+			print $dat $data_line;
+			print $dat " },\n";
+		}
+		else
+		{
+			die "Unexpected data type";
+		}
+	}
+}
+
+# Format the individual elements of a Perl hash into a valid string
+# representation.  We do this ourselves, rather than use native Perl
+# facilities, so we can keep control over the exact formatting of the
+# data files.
+sub format_line
+{
+	my $data = shift;
+	my @atts = @_;
+
+	my $first = 1;
+	my $value;
+	my $line = '';
+
+	foreach my $field (@atts)
+	{
+		next if !defined $data->{$field};
+		$value = $data->{$field};
+
+		# Re-escape single quotes.
+		$value =~ s/'/\\'/g;
+
+		if (!$first)
+		{
+			$line .= ', ';
+		}
+		$first = 0;
+
+		$line .= "$field => '$value'";
+	}
+	return $line;
+}
+
+# Rename .bak files back to .dat
+# This requires passing the .dat files as arguments to the script as normal.
+# XXX This is of questionable utility, since the files are under version
+# control, after all.
+sub revert
+{
+	foreach my $datfile (@input_files)
+	{
+		my $bakfile = "$datfile.bak";
+		if (-e $bakfile)
+		{
+			rename($bakfile, $datfile) or die "rename: $bakfile: $!";
+		}
+	}
+	exit 0;
+}
+
+sub usage
+{
+	die <<EOM;
+Usage: rewrite_dat.pl [options] datafile...
+
+Options:
+    -o PATH        write output files to PATH instead of the current directory
+    --revert       rename .dat.bak files back to .dat
+
+Expects a list of .dat files as arguments.
+
+Note: Catalog.pm must be on the Perl include path, e.g.
+perl -I ../../backend/catalog rewrite_dat.pl pg_proc.dat
+EOM
+}
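
A note for illustration (not part of the patch): with $preserve_formatting set,
Catalog::ParseData() hands rewrite_dat.pl a mix of plain strings (blank lines,
comments, brackets) and hash references, and format_line() writes each hash
back out with the metadata fields on one line and the catalog attributes, in
schema order, on the next. The standalone Perl sketch below mimics that layout
for a single hypothetical pg_proc-style record; the attribute list and values
are chosen for the example rather than read from a real header or data file.

    #!/usr/bin/perl
    # Standalone sketch of the entry layout rewrite_dat.pl produces:
    # metadata fields first, then catalog attributes in schema order.
    use strict;
    use warnings;

    my @METADATA = ('oid', 'oid_symbol', 'descr', 'shdescr');
    my @attnames = ('proname', 'prorettype', 'proargtypes', 'prosrc');

    # Hypothetical record; only the fields that are defined get printed.
    my %entry = (
        oid         => '1242',
        descr       => 'I/O',
        proname     => 'boolin',
        prorettype  => 'bool',
        proargtypes => 'cstring',
        prosrc      => 'boolin');

    # Join the fields present in the record as "name => 'value'" pairs.
    sub format_fields
    {
        my ($values, @fields) = @_;
        return join(', ',
            map { "$_ => '$values->{$_}'" }
              grep { defined $values->{$_} } @fields);
    }

    print "{ ", format_fields(\%entry, @METADATA), ",\n";
    print "  ", format_fields(\%entry, @attnames), " },\n";

Run on its own, the sketch prints an entry in the same shape the rewritten
.dat files use:

    { oid => '1242', descr => 'I/O',
      proname => 'boolin', prorettype => 'bool', proargtypes => 'cstring', prosrc => 'boolin' },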