diff options
Diffstat (limited to 'scripts/merge_xml.awk')
-rw-r--r-- | scripts/merge_xml.awk | 138 |
1 files changed, 138 insertions, 0 deletions
# scripts/merge_xml.awk
#
# Reads a main XML document on standard input / the command line and, for
# every entity reference line ("&name.xml;") found in it, starts a new output
# file below WORKDIR "/out/" containing the referenced file from
# WORKDIR "/in/".  Entity references found inside an included file are
# expanded in place, recursively, into the current output file.
#
# Variables that must be supplied by the caller (they are never assigned
# here, e.g. pass them with -v):
#   ENTLIST  file with one "entity:filename" definition per line
#   WORKDIR  directory holding the "in/" and "out/" trees
#   LOG      file that log messages are appended to (stderr if empty)
#
# The script keeps track of some special situations:
# - 'tags' in comments are not handled well by poxml tools, so these
#   are removed
# - references within comments should not be processed, so we keep
#   a count of opening and closing of comments

BEGIN {
    main_count = 1

    # Every message below is appended to LOG; printing to an empty file
    # name fails, so fall back to stderr when no log file was given.
    if (LOG == "") {
        LOG = "/dev/stderr"
    }

    if (ENTLIST == "") {
        print "** Error: no entity list (ENTLIST) given!" >>LOG
        entlist_failed = 1
        exit 1
    }

    # Let's first build an array with all the entities (xml files).
    # getline returns -1 on error, which counts as "true" in a bare
    # 'while (getline <file)' and would loop forever, hence the '> 0'.
    while ((status = (getline entry < ENTLIST)) > 0) {
        delim = index(entry, ":")
        if (delim == 0) {
            # Not an "entity:filename" definition (e.g. a blank line)
            continue
        }
        name = substr(entry, 1, delim - 1)

        fname = substr(entry, delim + 1)
        # Trim any leading and trailing space of filenames
        gsub(/^[[:space:]]*/, "", fname)
        gsub(/[[:space:]]*$/, "", fname)

        ent[name] = fname
        included[name] = 0
    }
    if (status < 0) {
        print "** Error: cannot read entity list '" ENTLIST "'!" >>LOG
        entlist_failed = 1
        exit 1
    }
    close(ENTLIST)
}

{
    # In the main loop we only want to process entities that are referred to
    line = $0
    if (is_entity_ref(line)) {
        process_file(line, "main")
    }
}

END {
    # 'exit' in BEGIN still runs END; there is nothing to report then
    if (entlist_failed) {
        exit 1
    }

    print "" >>LOG
    print "The following defined entities (from docstruct) were NOT processed:" >>LOG
    for (name in ent) {
        if (included[name] == 0) {
            print "  " name >>LOG
        }
    }
}

# Handle one entity reference line.
#   entline  the complete line holding the "&name.xml;" reference
#   level    "main" when the reference comes from the main input (a new
#            output file is started), "sub" when it comes from an included
#            file (the content is appended to the current output file)
# The remaining parameters are locals.  Uses and updates the globals
# main_count, included[], INFILE, OUTFILE and OUTDIR.
function process_file(entline, level,    fname, tfname, entname) {
    # entname is a local: process_file is re-entered through parse_file
    entname = get_entname(entline)

    if (!(entname in ent)) {
        print "** Entity " entname " not found and will be skipped!" >>LOG
        # Keep the unresolved reference in the output.  At main level there
        # may be no output file yet, and printing to "" would abort.
        if (OUTFILE != "") {
            print entline >>OUTFILE
        }
        return
    }

    fname = ent[entname]
    print "Processing: " fname >>LOG
    INFILE = WORKDIR "/in/" fname

    if (level == "main") {
        main_count += 1

        # Change at highest level: change to a new output file.  Close the
        # previous one first so open files do not pile up; output is always
        # appended (>>), so nothing is lost by closing.
        if (OUTFILE != "") {
            close(OUTFILE)
        }
        OUTFILE = WORKDIR "/out/" fname
        OUTDIR = OUTFILE
        gsub(/\/[^\/]*$/, "/", OUTDIR)             # strip filename
        system("mkdir -p " shell_quote(OUTDIR))    # create directory
    } else {
        print "" >>OUTFILE
    }

    if (level == "sub" && included[entname] != 0 && included[entname] < main_count) {
        print "** Warning: entity '" entname "'was also included in another file." >>LOG
    }
    # included[] holds 1 for entities used at main level and the current
    # main_count for entities pulled in from within an included file
    if (level == "main") {
        included[entname] = 1
    } else {
        included[entname] = main_count
    }
    parse_file(INFILE, fname)
}

# Copy PARSEFILE to the current OUTFILE, expanding nested entity references.
#   PARSEFILE  path of the file to read
#   FNAME      name used in the "Start of file"/"End of file" markers
# The remaining parameters are locals.
function parse_file(PARSEFILE, FNAME,    fname, nwline, comment_count, status) {
    comment_count = 0
    fname = FNAME

    # Test whether the file can be read.  The getline return value is used
    # (rather than the gawk-only ERRNO, which can still hold the message of
    # an earlier failure), and the line read here is processed below instead
    # of being thrown away.
    status = (getline nwline < PARSEFILE)
    if (status < 0) {
        print "** Error: file '" PARSEFILE "' does not exist!" >>LOG
        return
    }

    print "<!-- Start of file " fname " -->" >>OUTFILE
    while (status > 0) {
        # Update the count of 'open' comments
        comment_count += count_comments(nwline)

        if (is_entity_ref(nwline)) {
            # If we find another entity reference, we process that file recursively
            # But not if the reference is within a comment
            if (comment_count != 0) {
                print "** Skipping entity reference '" nwline "' found in comment!" >>LOG
            } else {
                process_file(nwline, "sub")
            }
        } else {
            # Else we just print the line
            if (nwline ~ /<!--.*<.*>.*<.*>.*-->/) {
                # Comments containing "<...> ... <...>" are not handled correctly
                # by xml2pot and split2po, so we skip lines like that
                # Note: this is a workaround for a bug in the tools:
                #       http://bugs.kde.org/show_bug.cgi?id=90294
                print "** Comment deleted in line '" nwline "'" >>LOG
                gsub(/<!--.*<.*>.*<.*>.*-->/, "", nwline)
            }
            print nwline >>OUTFILE
        }

        status = (getline nwline < PARSEFILE)
    }
    if (comment_count != 0) {
        print "** Comment count is not zero at end of file: " comment_count >>LOG
    }
    print "<!-- End of file " fname " -->" >>OUTFILE
    close(PARSEFILE)
}

# Return true when text consists of a single "&....xml;" entity reference,
# optionally surrounded by whitespace and followed by comments.
function is_entity_ref(text) {
    return (text ~ /^[[:space:]]*&.*\.xml;[[:space:]]*(<!--.*-->[[:space:]]*)*$/)
}

# Return str wrapped in single quotes so that the shell started by system()
# sees it as exactly one word; embedded single quotes become '\''.
function shell_quote(str,    parts, n, i, quoted) {
    n = split(str, parts, "'")
    quoted = parts[1]
    for (i = 2; i <= n; i++) {
        quoted = quoted "'\\''" parts[i]
    }
    return "'" quoted "'"
}

# Return the entity name in entline: the text between the leading '&' and
# the first ';'.
function get_entname(entline,    ename) {
    # Parse the name of the entity out of the entity reference
    ename = entline
    sub(/^[[:space:]]*&/, "", ename)
    sub(/;.*$/, "", ename)
    return ename
}

# Return the number of comment openers ("<!--") minus the number of comment
# closers ("-->") found in inline.
function count_comments(inline,    tmpline, count) {
    # 'abuse' gsub to count them
    tmpline = inline
    count = gsub(/<!--/, "", tmpline)
    count -= gsub(/-->/, "", tmpline)
    return count
}