From 18a4540e7a108949cb2fc4aa292a41ad22e29394 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 15 Jan 2018 10:19:30 +0700 Subject: [PATCH] Create data conversion infrastructure Remove data parsing from the original Catalogs() function and rename it to ParseHeader() to reflect its new, limited role of extracting the schema info from a single header. The new data files are handled by the new function ParseData(). Having these functions work with only one file at a time requires their callers to do more work, but results in a cleaner design. rewrite_dat.pl reads in pg_*.dat files and rewrites them in a standard format. It writes attributes in order, preserves comments and folds consecutive blank lines. The meta-attributes oid, oid_symbol and (sh)descr are on their own line, if present. --- src/backend/catalog/Catalog.pm | 227 ++++++++++++++++++------------------- src/include/catalog/rewrite_dat.pl | 200 ++++++++++++++++++++++++++++++++ 2 files changed, 311 insertions(+), 116 deletions(-) create mode 100644 src/include/catalog/rewrite_dat.pl diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm index 9ced154..60e641e 100644 --- a/src/backend/catalog/Catalog.pm +++ b/src/backend/catalog/Catalog.pm @@ -1,7 +1,7 @@ #---------------------------------------------------------------------- # # Catalog.pm -# Perl module that extracts info from catalog headers into Perl +# Perl module that extracts info from catalog files into Perl # data structures # # Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group @@ -16,12 +16,11 @@ package Catalog; use strict; use warnings; -# Call this function with an array of names of header files to parse. -# Returns a nested data structure describing the data in the headers. -sub Catalogs +# Parses a catalog header file into a data structure describing the schema +# of the catalog. +sub ParseHeader { - my (%catalogs, $catname, $declaring_attributes, $most_recent); - $catalogs{names} = []; + my $input_file = shift; # There are a few types which are given one name in the C source, but a # different name at the SQL level. These are enumerated here. @@ -34,19 +33,16 @@ sub Catalogs 'TransactionId' => 'xid', 'XLogRecPtr' => 'pg_lsn'); - foreach my $input_file (@_) - { my %catalog; + my $declaring_attributes = 0; my $is_varlen = 0; $catalog{columns} = []; - $catalog{data} = []; + $catalog{toasting} = []; + $catalog{indexing} = []; open(my $ifh, '<', $input_file) || die "$input_file: $!"; - my ($filename) = ($input_file =~ m/(\w+)\.h$/); - my $natts_pat = "Natts_$filename"; - # Scan the input file. while (<$ifh>) { @@ -64,9 +60,6 @@ sub Catalogs redo; } - # Remember input line number for later. - my $input_line_number = $.; - # Strip useless whitespace and trailing semicolons. chomp; s/^\s+//; @@ -74,68 +67,17 @@ sub Catalogs s/\s+/ /g; # Push the data into the appropriate data structure. 
-		if (/$natts_pat\s+(\d+)/)
-		{
-			$catalog{natts} = $1;
-		}
-		elsif (
-			/^DATA\(insert(\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/)
-		{
-			check_natts($filename, $catalog{natts}, $3, $input_file,
-						$input_line_number);
-
-			push @{ $catalog{data} }, { oid => $2, bki_values => $3 };
-		}
-		elsif (/^DESCR\(\"(.*)\"\)$/)
-		{
-			$most_recent = $catalog{data}->[-1];
-
-			# this tests if most recent line is not a DATA() statement
-			if (ref $most_recent ne 'HASH')
-			{
-				die "DESCR() does not apply to any catalog ($input_file)";
-			}
-			if (!defined $most_recent->{oid})
-			{
-				die "DESCR() does not apply to any oid ($input_file)";
-			}
-			elsif ($1 ne '')
-			{
-				$most_recent->{descr} = $1;
-			}
-		}
-		elsif (/^SHDESCR\(\"(.*)\"\)$/)
-		{
-			$most_recent = $catalog{data}->[-1];
-
-			# this tests if most recent line is not a DATA() statement
-			if (ref $most_recent ne 'HASH')
-			{
-				die
-				  "SHDESCR() does not apply to any catalog ($input_file)";
-			}
-			if (!defined $most_recent->{oid})
-			{
-				die "SHDESCR() does not apply to any oid ($input_file)";
-			}
-			elsif ($1 ne '')
-			{
-				$most_recent->{shdescr} = $1;
-			}
-		}
-		elsif (/^DECLARE_TOAST\(\s*(\w+),\s*(\d+),\s*(\d+)\)/)
+		if (/^DECLARE_TOAST\(\s*(\w+),\s*(\d+),\s*(\d+)\)/)
 		{
-			$catname = 'toasting';
 			my ($toast_name, $toast_oid, $index_oid) = ($1, $2, $3);
-			push @{ $catalog{data} },
+			push @{ $catalog{toasting} },
 			  "declare toast $toast_oid $index_oid on $toast_name\n";
 		}
 		elsif (/^DECLARE_(UNIQUE_)?INDEX\(\s*(\w+),\s*(\d+),\s*(.+)\)/)
 		{
-			$catname = 'indexing';
 			my ($is_unique, $index_name, $index_oid, $using) =
 			  ($1, $2, $3, $4);
-			push @{ $catalog{data} },
+			push @{ $catalog{indexing} },
 			  sprintf(
 				"declare %sindex %s %s %s\n",
 				$is_unique ? 'unique ' : '',
@@ -143,16 +85,13 @@ sub Catalogs
 		}
 		elsif (/^BUILD_INDICES/)
 		{
-			push @{ $catalog{data} }, "build indices\n";
+			push @{ $catalog{indexing} }, "build indices\n";
 		}
 		elsif (/^CATALOG\(([^,]*),(\d+)\)/)
 		{
-			$catname = $1;
+			$catalog{catname} = $1;
 			$catalog{relation_oid} = $2;
 
-			# Store pg_* catalog names in the same order we receive them
-			push @{ $catalogs{names} }, $catname;
-
 			$catalog{bootstrap} = /BKI_BOOTSTRAP/ ? ' bootstrap' : '';
 			$catalog{shared_relation} =
 			  /BKI_SHARED_RELATION/ ? ' shared_relation' : '';
@@ -173,7 +112,7 @@ sub Catalogs
 			}
 			if (/^}/)
 			{
-				undef $declaring_attributes;
+				$declaring_attributes = 0;
 			}
 			else
 			{
@@ -227,32 +166,107 @@ sub Catalogs
 				}
 			}
 		}
-		$catalogs{$catname} = \%catalog;
 		close $ifh;
-	}
-	return \%catalogs;
+	return \%catalog;
 }
 
-# Split a DATA line into fields.
-# Call this on the bki_values element of a DATA item returned by Catalogs();
-# it returns a list of field values.  We don't strip quoting from the fields.
-# Note: it should be safe to assign the result to a list of length equal to
-# the nominal number of catalog fields, because check_natts already checked
-# the number of fields.
-sub SplitDataLine
+# Parses a file containing Perl data structure literals, returning live data.
+#
+# The parameter $preserve_formatting needs to be set for callers that want
+# to work with non-data lines in the data files, such as comments and blank
+# lines.  If a caller just wants to consume the data, leave it unset.
+sub ParseData
 {
-	my $bki_values = shift;
-
-	# This handling of quoted strings might look too simplistic, but it
-	# matches what bootscanner.l does: that has no provision for quote marks
-	# inside quoted strings, either.  If we don't have a quoted string, just
-	# snarf everything till next whitespace.  That will accept some things
-	# that bootscanner.l will see as erroneous tokens; but it seems wiser
-	# to do that and let bootscanner.l complain than to silently drop
-	# non-whitespace characters.
-	my @result = $bki_values =~ /"[^"]*"|\S+/g;
-
-	return @result;
+	my ($input_file, $schema, $preserve_formatting) = @_;
+
+	$input_file =~ /\w+\.dat$/
+	  or die "Input file needs to be a .dat file.\n";
+	my $data = [];
+
+	# Read entire file into a string and eval it.
+	if (!$preserve_formatting)
+	{
+		my $file_string = do
+		{
+			local $/ = undef;
+			open my $ifd, "<", $input_file or die "$input_file: $!";
+			<$ifd>;
+		};
+
+		eval '$data = ' . $file_string;
+		print "Error: $@\n" if $@;
+		return $data;
+	}
+
+	# When preserving formatting, we scan the file one line at a time
+	# and decide how to handle each item.  We don't check too closely
+	# for valid syntax, since we assume it will be checked otherwise.
+	my $prev_blank = 0;
+	open(my $ifd, '<', $input_file) or die "$input_file: $!";
+	while (<$ifd>)
+	{
+		my $datum;
+
+		# Capture non-consecutive blank lines.
+		if (/^\s*$/)
+		{
+			next if $prev_blank;
+			$prev_blank = 1;
+
+			# Newline gets added by caller.
+			$datum = '';
+		}
+		else
+		{
+			$prev_blank = 0;
+		}
+
+		# Capture comments that are on their own line.
+		if (/^\s*(#.*?)\s*$/)
+		{
+			$datum = $1;
+		}
+
+		# Capture brackets that are on their own line.
+		elsif (/^\s*(\[|\])\s*$/)
+		{
+			$datum = $1;
+		}
+
+		# Capture hash references
+		# NB: Assumes that the next hash ref can't start on the
+		# same line where the present one ended.
+		# Not foolproof, but we shouldn't need a full parser,
+		# since we expect relatively well-behaved input.
+		elsif (/{/)
+		{
+			# Quick hack to detect when we have a full hash ref to
+			# parse.  We can't just use a regex because of values in
+			# pg_aggregate and pg_proc like '{0,0}'.
+			my $lcnt = tr/{//;
+			my $rcnt = tr/}//;
+
+			if ($lcnt == $rcnt)
+			{
+				eval '$datum = ' . $_;
+				if (!ref $datum)
+				{
+					die "Error parsing $_\n$@";
+				}
+			}
+			else
+			{
+				my $next_line = <$ifd>;
+				die "$input_file: ends within Perl hash\n"
+				  if !defined $next_line;
+				$_ .= $next_line;
+				redo;
+			}
+		}
+		push @$data, $datum if defined $datum;
+	}
+	close $ifd;
+	return $data;
 }
 
 # Fill in default values of a record using the given schema.  It's the
@@ -308,7 +322,6 @@ sub RenameTempFile
 	rename($temp_name, $final_name) || die "rename: $temp_name: $!";
 }
 
-
 # Find a symbol defined in a particular header file and extract the value.
 #
 # The include path has to be passed as a reference to an array.
@@ -340,22 +353,4 @@ sub FindDefinedSymbol
 	die "$catalog_header: not found in any include directory\n";
 }
 
-
-# verify the number of fields in the passed-in DATA line
-sub check_natts
-{
-	my ($catname, $natts, $bki_val, $file, $line) = @_;
-
-	die
-"Could not find definition for Natts_${catname} before start of DATA() in $file\n"
-	  unless defined $natts;
-
-	my $nfields = scalar(SplitDataLine($bki_val));
-
-	die sprintf
-"Wrong number of attributes in DATA() entry at %s:%d (expected %d but got %d)\n",
-	  $file, $line, $natts, $nfields
-	  unless $natts == $nfields;
-}
-
 1;
diff --git a/src/include/catalog/rewrite_dat.pl b/src/include/catalog/rewrite_dat.pl
new file mode 100644
index 0000000..e0d0d66
--- /dev/null
+++ b/src/include/catalog/rewrite_dat.pl
@@ -0,0 +1,200 @@
+#!/usr/bin/perl -w
+#----------------------------------------------------------------------
+#
+# rewrite_dat.pl
+#    Perl script that reads in a catalog data file and writes out
+#    a functionally equivalent file in a standard format.
+#
+# -Metadata entries are on their own line within the data entry.
+# -Attributes are in the same order they would be in the catalog table.
+#
+# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# /src/include/catalog/rewrite_dat.pl
+#
+#----------------------------------------------------------------------
+
+use Catalog;
+
+use strict;
+use warnings;
+
+my @input_files;
+my $output_path = '';
+
+# Process command line switches.
+while (@ARGV)
+{
+	my $arg = shift @ARGV;
+	if ($arg !~ /^-/)
+	{
+		push @input_files, $arg;
+	}
+	elsif ($arg =~ /^-o/)
+	{
+		$output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV;
+	}
+	elsif ($arg eq '--revert')
+	{
+		revert();
+	}
+	else
+	{
+		usage();
+	}
+}
+
+# Sanity check arguments.
+die "No input files.\n"
+  if !@input_files;
+
+# Make sure output_path ends in a slash.
+if ($output_path ne '' && substr($output_path, -1) ne '/')
+{
+	$output_path .= '/';
+}
+
+# Metadata of a catalog entry
+my @METADATA = ('oid', 'oid_symbol', 'descr', 'shdescr');
+
+# Read all the input files into internal data structures.
+# We pass data file names as arguments and then look for matching
+# headers to parse the schema from.
+foreach my $datfile (@input_files)
+{
+	$datfile =~ /(.+)\.dat$/
+	  or die "Input files need to be data (.dat) files.\n";
+
+	my $header = "$1.h";
+	die "There is no header file corresponding to $datfile"
+	  if ! -e $header;
+
+	my @attnames;
+	my $catalog = Catalog::ParseHeader($header);
+	my $catname = $catalog->{catname};
+	my $schema  = $catalog->{columns};
+
+	foreach my $column (@$schema)
+	{
+		my $attname = $column->{name};
+		push @attnames, $attname;
+	}
+
+	my $catalog_data = Catalog::ParseData($datfile, $schema, 1);
+	next if !defined $catalog_data;
+
+	# Back up old data file rather than overwrite it.  The input path and
+	# output path are normally the same, but we don't assume that.
+	my $newdatfile = "$output_path$catname.dat";
+	if (-e $newdatfile)
+	{
+		rename($newdatfile, $newdatfile . '.bak')
+		  or die "rename: $newdatfile: $!";
+	}
+	open my $dat, '>', $newdatfile
+	  or die "can't open $newdatfile: $!";
+
+	# Write the data.
+	foreach my $data (@$catalog_data)
+	{
+		# Either a newline, comment, or bracket - just write it out.
+		if (! ref $data)
+		{
+			print $dat "$data\n";
+		}
+		# Hash ref representing a data entry.
+		elsif (ref $data eq 'HASH')
+		{
+			my %values = %$data;
+			print $dat "{ ";
+
+			# Separate out metadata fields for readability.
+			my $metadata_line = format_line(\%values, @METADATA);
+			if ($metadata_line)
+			{
+				print $dat $metadata_line;
+				print $dat ",\n";
+			}
+			my $data_line = format_line(\%values, @attnames);
+
+			# Line up with metadata line, if there is one.
+			if ($metadata_line)
+			{
+				print $dat '  ';
+			}
+			print $dat $data_line;
+			print $dat " },\n";
+		}
+		else
+		{
+			die "Unexpected data type";
+		}
+	}
+}
+
+# Format the individual elements of a Perl hash into a valid string
+# representation.  We do this ourselves, rather than use native Perl
+# facilities, so we can keep control over the exact formatting of the
+# data files.
+sub format_line
+{
+	my $data = shift;
+	my @atts = @_;
+
+	my $first = 1;
+	my $value;
+	my $line = '';
+
+	foreach my $field (@atts)
+	{
+		next if !defined $data->{$field};
+		$value = $data->{$field};
+
+		# Re-escape single quotes.
+		$value =~ s/'/\\'/g;
+
+		if (!$first)
+		{
+			$line .= ', ';
+		}
+		$first = 0;
+
+		$line .= "$field => '$value'";
+	}
+	return $line;
+}
+
+# Rename .bak files back to .dat
+# This requires passing the .dat files as arguments to the script as normal.
+# XXX This is of questionable utility, since the files are under version
+# control, after all.
+sub revert
+{
+	foreach my $datfile (@input_files)
+	{
+		my $bakfile = "$datfile.bak";
+		if (-e $bakfile)
+		{
+			rename($bakfile, $datfile) or die "rename: $bakfile: $!";
+		}
+	}
+	exit 0;
+}
+
+sub usage
+{
+	die <<EOM;
+Usage: rewrite_dat.pl [options] datafile...
+
+Options:
+    -o PATH        write output files to PATH instead of the current directory
+    --revert       rename .dat.bak files back to .dat
+
+Expects a list of .dat files as arguments.
+
+Note: Catalog.pm must be on the Perl include path, e.g.
+perl -I ../../backend/catalog rewrite_dat.pl pg_proc.dat
+EOM
+}
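
A note for illustration (not part of the patch): with $preserve_formatting set,
Catalog::ParseData() hands rewrite_dat.pl a mix of plain strings (blank lines,
comments, brackets) and hash references, and format_line() writes each hash
back out with the metadata fields on one line and the catalog attributes, in
schema order, on the next. The standalone Perl sketch below mimics that layout
for a single hypothetical pg_proc-style record; the attribute list and values
are chosen for the example rather than read from a real header or data file.

    #!/usr/bin/perl
    # Standalone sketch of the entry layout rewrite_dat.pl produces:
    # metadata fields first, then catalog attributes in schema order.
    use strict;
    use warnings;

    my @METADATA = ('oid', 'oid_symbol', 'descr', 'shdescr');
    my @attnames = ('proname', 'prorettype', 'proargtypes', 'prosrc');

    # Hypothetical record; only the fields that are defined get printed.
    my %entry = (
        oid         => '1242',
        descr       => 'I/O',
        proname     => 'boolin',
        prorettype  => 'bool',
        proargtypes => 'cstring',
        prosrc      => 'boolin');

    # Join the fields present in the record as "name => 'value'" pairs.
    sub format_fields
    {
        my ($values, @fields) = @_;
        return join(', ',
            map { "$_ => '$values->{$_}'" }
              grep { defined $values->{$_} } @fields);
    }

    print "{ ", format_fields(\%entry, @METADATA), ",\n";
    print "  ", format_fields(\%entry, @attnames), " },\n";

Run on its own, the sketch prints an entry in the same shape the rewritten
.dat files use:

    { oid => '1242', descr => 'I/O',
      proname => 'boolin', prorettype => 'bool', proargtypes => 'cstring', prosrc => 'boolin' },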