From d6f191e88812bbadce446f67f88834af9c697e4a Mon Sep 17 00:00:00 2001
From: John Naylor <jcnaylor@gmail.com>
Date: Sun, 11 Mar 2018 16:18:10 +0700
Subject: [PATCH] Create infrastructure for working with the new data files

Remove data parsing from the original Catalogs() function and rename it to
ParseHeader() to reflect its new, limited role of extracting the schema
info from a single header. The new data files are handled by the new
function ParseData(). Having these functions work with only one file at a
time requires their callers to do more work, but results in a cleaner design.

rewrite_dat.pl reads in pg_*.dat files and rewrites them in a standard
format. It writes attributes in order, preserves comments and folds
consecutive blank lines. The meta-attributes oid, oid_symbol and (sh)descr
are on their own line, if present.
---
 src/backend/catalog/Catalog.pm     | 219 +++++++++++++++++--------------------
 src/include/catalog/rewrite_dat.pl | 172 +++++++++++++++++++++++++++++
 2 files changed, 275 insertions(+), 116 deletions(-)
 create mode 100644 src/include/catalog/rewrite_dat.pl

diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm
index 9ced154..dc6e0d3 100644
--- a/src/backend/catalog/Catalog.pm
+++ b/src/backend/catalog/Catalog.pm
@@ -1,7 +1,7 @@
 #----------------------------------------------------------------------
 #
 # Catalog.pm
-#    Perl module that extracts info from catalog headers into Perl
+#    Perl module that extracts info from catalog files into Perl
 #    data structures
 #
 # Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
@@ -16,12 +16,11 @@ package Catalog;
 use strict;
 use warnings;

-# Call this function with an array of names of header files to parse.
-# Returns a nested data structure describing the data in the headers.
-sub Catalogs
+# Parses a catalog header file into a data structure describing the schema
+# of the catalog.
+sub ParseHeader
 {
-	my (%catalogs, $catname, $declaring_attributes, $most_recent);
-	$catalogs{names} = [];
+	my $input_file = shift;

 	# There are a few types which are given one name in the C source, but a
 	# different name at the SQL level.  These are enumerated here.
@@ -34,19 +33,16 @@ sub Catalogs
 		'TransactionId' => 'xid',
 		'XLogRecPtr'    => 'pg_lsn');

-	foreach my $input_file (@_)
-	{
 		my %catalog;
+		my $declaring_attributes = 0;
 		my $is_varlen     = 0;

 		$catalog{columns} = [];
-		$catalog{data}    = [];
+		$catalog{toasting} = [];
+		$catalog{indexing} = [];

 		open(my $ifh, '<', $input_file) || die "$input_file: $!";

-		my ($filename) = ($input_file =~ m/(\w+)\.h$/);
-		my $natts_pat = "Natts_$filename";
-
 		# Scan the input file.
 		while (<$ifh>)
 		{
@@ -64,9 +60,6 @@ sub Catalogs
 				redo;
 			}

-			# Remember input line number for later.
-			my $input_line_number = $.;
-
 			# Strip useless whitespace and trailing semicolons.
 			chomp;
 			s/^\s+//;
@@ -74,68 +67,17 @@ sub Catalogs
 			s/\s+/ /g;

 			# Push the data into the appropriate data structure.
-			if (/$natts_pat\s+(\d+)/)
-			{
-				$catalog{natts} = $1;
-			}
-			elsif (
-				/^DATA\(insert(\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/)
-			{
-				check_natts($filename, $catalog{natts}, $3, $input_file,
-					$input_line_number);
-
-				push @{ $catalog{data} }, { oid => $2, bki_values => $3 };
-			}
-			elsif (/^DESCR\(\"(.*)\"\)$/)
-			{
-				$most_recent = $catalog{data}->[-1];
-
-				# this tests if most recent line is not a DATA() statement
-				if (ref $most_recent ne 'HASH')
-				{
-					die "DESCR() does not apply to any catalog ($input_file)";
-				}
-				if (!defined $most_recent->{oid})
-				{
-					die "DESCR() does not apply to any oid ($input_file)";
-				}
-				elsif ($1 ne '')
-				{
-					$most_recent->{descr} = $1;
-				}
-			}
-			elsif (/^SHDESCR\(\"(.*)\"\)$/)
+			if (/^DECLARE_TOAST\(\s*(\w+),\s*(\d+),\s*(\d+)\)/)
 			{
-				$most_recent = $catalog{data}->[-1];
-
-				# this tests if most recent line is not a DATA() statement
-				if (ref $most_recent ne 'HASH')
-				{
-					die
-					  "SHDESCR() does not apply to any catalog ($input_file)";
-				}
-				if (!defined $most_recent->{oid})
-				{
-					die "SHDESCR() does not apply to any oid ($input_file)";
-				}
-				elsif ($1 ne '')
-				{
-					$most_recent->{shdescr} = $1;
-				}
-			}
-			elsif (/^DECLARE_TOAST\(\s*(\w+),\s*(\d+),\s*(\d+)\)/)
-			{
-				$catname = 'toasting';
 				my ($toast_name, $toast_oid, $index_oid) = ($1, $2, $3);
-				push @{ $catalog{data} },
+				push @{ $catalog{toasting} },
 				  "declare toast $toast_oid $index_oid on $toast_name\n";
 			}
 			elsif (/^DECLARE_(UNIQUE_)?INDEX\(\s*(\w+),\s*(\d+),\s*(.+)\)/)
 			{
-				$catname = 'indexing';
 				my ($is_unique, $index_name, $index_oid, $using) =
 				  ($1, $2, $3, $4);
-				push @{ $catalog{data} },
+				push @{ $catalog{indexing} },
 				  sprintf(
 					"declare %sindex %s %s %s\n",
 					$is_unique ? 'unique ' : '',
@@ -143,16 +85,13 @@ sub Catalogs
 			}
 			elsif (/^BUILD_INDICES/)
 			{
-				push @{ $catalog{data} }, "build indices\n";
+				push @{ $catalog{indexing} }, "build indices\n";
 			}
 			elsif (/^CATALOG\(([^,]*),(\d+)\)/)
 			{
-				$catname = $1;
+				$catalog{catname} = $1;
 				$catalog{relation_oid} = $2;

-				# Store pg_* catalog names in the same order we receive them
-				push @{ $catalogs{names} }, $catname;
-
 				$catalog{bootstrap} = /BKI_BOOTSTRAP/ ? ' bootstrap' : '';
 				$catalog{shared_relation} =
 				  /BKI_SHARED_RELATION/ ? ' shared_relation' : '';
@@ -173,7 +112,7 @@ sub Catalogs
 				}
 				if (/^}/)
 				{
-					undef $declaring_attributes;
+					$declaring_attributes = 0;
 				}
 				else
 				{
@@ -227,32 +166,99 @@ sub Catalogs
 				}
 			}
 		}
-		$catalogs{$catname} = \%catalog;
 		close $ifh;
-	}
-	return \%catalogs;
+	return \%catalog;
 }

-# Split a DATA line into fields.
-# Call this on the bki_values element of a DATA item returned by Catalogs();
-# it returns a list of field values.  We don't strip quoting from the fields.
-# Note: it should be safe to assign the result to a list of length equal to
-# the nominal number of catalog fields, because check_natts already checked
-# the number of fields.
-sub SplitDataLine
+# Parses a file containing Perl data structure literals, returning live data.
+#
+# The parameter $preserve_formatting needs to be set for callers that want
+# to work with non-data lines in the data files, such as comments and blank
+# lines. If a caller just wants consume the data, leave it unset.
+sub ParseData
 {
-	my $bki_values = shift;
-
-	# This handling of quoted strings might look too simplistic, but it
-	# matches what bootscanner.l does: that has no provision for quote marks
-	# inside quoted strings, either.  If we don't have a quoted string, just
-	# snarf everything till next whitespace.  That will accept some things
-	# that bootscanner.l will see as erroneous tokens; but it seems wiser
-	# to do that and let bootscanner.l complain than to silently drop
-	# non-whitespace characters.
-	my @result = $bki_values =~ /"[^"]*"|\S+/g;
-
-	return @result;
+	my ($input_file, $schema, $preserve_formatting) = @_;
+
+	open(my $ifd, '<', $input_file) || die "$input_file: $!";
+	$input_file =~ /\w+\.dat$/
+	  or die "Input file needs to be a .dat file.\n";
+	my $data = [];
+	my $prev_blank = 0;
+
+	# Scan the input file.
+	while (<$ifd>)
+	{
+		my $datum;
+
+		if (/^\s*$/)
+		{
+			# Preserve non-consecutive blank lines.
+			# Newline gets added by caller.
+			next if $prev_blank;
+			$datum = '';
+			$prev_blank = 1;
+		}
+		else
+		{
+			$prev_blank = 0;
+		}
+
+		if (/{/)
+		{
+			# Capture the hash ref
+			# NB: Assumes that the next hash ref can't start on the
+			# same line where the present one ended.
+			# Not foolproof, but we shouldn't need a full parser,
+			# since we expect relatively well-behaved input.
+
+			# Quick hack to detect when we have a full hash ref to
+			# parse. We can't just use a regex because of values in
+			# pg_aggregate and pg_proc like '{0,0}'.
+			my $lcnt = tr/{//;
+			my $rcnt = tr/}//;
+
+			if ($lcnt == $rcnt)
+			{
+				eval '$datum = ' . $_;
+				if (!ref $datum)
+				{
+					die "Error parsing $_\n$!";
+				}
+			}
+			else
+			{
+				my $next_line = <$ifd>;
+				die "$input_file: ends within Perl hash\n"
+				  if !defined $next_line;
+				$_ .= $next_line;
+				redo;
+			}
+		}
+
+		# Capture comments that are on their own line.
+		elsif (/^\s*(#.*?)\s*$/)
+		{
+			$datum = $1;
+		}
+
+		# Assume bracket is the only token in the line.
+		elsif (/^\s*(\[|\])\s*$/)
+		{
+			$datum = $1;
+		}
+
+		next if !defined $datum;
+
+		# Hash references are data, so always push.
+		# Other datums are non-data strings, so only push if we
+		# want formatting.
+		if ($preserve_formatting or ref $datum eq 'HASH')
+		{
+			push @$data, $datum;
+		}
+	}
+	close $ifd;
+	return $data;
 }

 # Fill in default values of a record using the given schema. It's the
@@ -308,7 +314,6 @@ sub RenameTempFile
 	rename($temp_name, $final_name) || die "rename: $temp_name: $!";
 }

-
 # Find a symbol defined in a particular header file and extract the value.
 #
 # The include path has to be passed as a reference to an array.
@@ -340,22 +345,4 @@ sub FindDefinedSymbol
 	die "$catalog_header: not found in any include directory\n";
 }

-
-# verify the number of fields in the passed-in DATA line
-sub check_natts
-{
-	my ($catname, $natts, $bki_val, $file, $line) = @_;
-
-	die
-"Could not find definition for Natts_${catname} before start of DATA() in $file\n"
-	  unless defined $natts;
-
-	my $nfields = scalar(SplitDataLine($bki_val));
-
-	die sprintf
-"Wrong number of attributes in DATA() entry at %s:%d (expected %d but got %d)\n",
-	  $file, $line, $natts, $nfields
-	  unless $natts == $nfields;
-}
-
 1;
diff --git a/src/include/catalog/rewrite_dat.pl b/src/include/catalog/rewrite_dat.pl
new file mode 100644
index 0000000..614ec01
--- /dev/null
+++ b/src/include/catalog/rewrite_dat.pl
@@ -0,0 +1,172 @@
+#!/usr/bin/perl -w
+#----------------------------------------------------------------------
+#
+# rewrite_dat.pl
+#    Perl script that reads in a catalog data file and writes out
+#    a functionally equivalent file in a standard format.
+#
+#    -Metadata entries are on their own line within the data entry.
+#    -Attributes are in the same order they would be in the catalog table.
+#
+# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# /src/include/catalog/rewrite_dat.pl
+#
+#----------------------------------------------------------------------
+
+use Catalog;
+
+use strict;
+use warnings;
+
+my @input_files;
+my $output_path = '';
+
+# Process command line switches.
+while (@ARGV)
+{
+	my $arg = shift @ARGV;
+	if ($arg !~ /^-/)
+	{
+		push @input_files, $arg;
+	}
+	elsif ($arg =~ /^-o/)
+	{
+		$output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV;
+	}
+	else
+	{
+		usage();
+	}
+}
+
+# Sanity check arguments.
+die "No input files.\n"
+  if !@input_files;
+
+# Make sure output_path ends in a slash.
+if ($output_path ne '' && substr($output_path, -1) ne '/')
+{
+	$output_path .= '/';
+}
+
+# Metadata of a catalog entry
+my @METADATA = ('oid', 'oid_symbol', 'descr', 'shdescr');
+
+# Read all the input files into internal data structures.
+# We pass data file names as arguments and then look for matching
+# headers to parse the schema from.
+foreach my $datfile (@input_files)
+{
+	$datfile =~ /(.+)\.dat$/
+	  or die "Input files need to be data (.dat) files.\n";
+
+	my $header = "$1.h";
+	die "There in no header file corresponding to $datfile"
+	  if ! -e $header;
+
+	my @attnames;
+	my $catalog = Catalog::ParseHeader($header);
+	my $catname = $catalog->{catname};
+	my $schema  = $catalog->{columns};
+
+	foreach my $column (@$schema)
+	{
+		my $attname = $column->{name};
+		push @attnames, $attname;
+	}
+
+	my $catalog_data = Catalog::ParseData($datfile, $schema, 1);
+	next if !defined $catalog_data;
+
+	# Overwrite .dat files in place.
+	my $datfile = "$output_path$catname.dat";
+	open my $dat, '>', $datfile
+	  or die "can't open $datfile: $!";
+
+	# Write the data.
+	foreach my $data (@$catalog_data)
+	{
+		# Either a newline, comment, or bracket - just write it out.
+		if (! ref $data)
+		{
+			print $dat "$data\n";
+		}
+		# Hash ref representing a data entry.
+		elsif (ref $data eq 'HASH')
+		{
+			my %values = %$data;
+			print $dat "{ ";
+
+			# Separate out metadata fields for readability.
+			my $metadata_line = format_line(\%values, @METADATA);
+			if ($metadata_line)
+			{
+				print $dat $metadata_line;
+				print $dat ",\n";
+			}
+			my $data_line = format_line(\%values, @attnames);
+
+			# Line up with metadata line, if there is one.
+			if ($metadata_line)
+			{
+				print $dat '  ';
+			}
+			print $dat $data_line;
+			print $dat " },\n";
+		}
+		else
+		{
+			die "Unexpected data type";
+		}
+	}
+}
+
+# Format the individual elements of a Perl hash into a valid string
+# representation. We do this ourselves, rather than use native Perl
+# facilities, so we can keep control over the exact formatting of the
+# data files.
+sub format_line
+{
+	my $data = shift;
+	my @attnames = @_;
+
+	my $first = 1;
+	my $value;
+	my $line = '';
+
+	foreach my $attname (@attnames)
+	{
+		next if !defined $data->{$attname};
+		$value = $data->{$attname};
+
+		# Re-escape single quotes.
+		$value =~ s/'/\\'/g;
+
+		if (!$first)
+		{
+			$line .= ', ';
+		}
+		$first = 0;
+
+		$line .= "$attname => '$value'";
+	}
+	return $line;
+}
+
+sub usage
+{
+	die <<EOM;
+Usage: rewrite_dat.pl [options] datafile...
+
+Options:
+    -o               output path
+
+Expects a list of .dat files as arguments.
+
+Make sure location of Catalog.pm is passed to the perl interpreter:
+perl -I /path/to/Catalog.pm/ ...
+
+EOM
+}
-- 
2.7.4