summaryrefslogtreecommitdiff
path: root/scripts/merge_xml.awk
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/merge_xml.awk')
-rw-r--r--scripts/merge_xml.awk138
1 files changed, 138 insertions, 0 deletions
diff --git a/scripts/merge_xml.awk b/scripts/merge_xml.awk
new file mode 100644
index 000000000..b0f590690
--- /dev/null
+++ b/scripts/merge_xml.awk
@@ -0,0 +1,138 @@
+# The script keeps track of some special situations:
+# - 'tags' in comments are not handled well by poxml tools, so these
+# are removed
+# - references within comments should not be processed, so we keep
+# a count of opening and closing of comments
+
+# Initialisation: load the entity table before any input line is processed.
+#
+# Reads the file named by the variable ENTLIST, which is expected to hold one
+# "entity:filename" pair per line, and fills two global arrays:
+#   ent[entity]      - the file name belonging to the entity
+#   included[entity] - 0 until the entity has been merged (see process_file)
+# main_count is the global counter that process_file increments for every
+# top-level entity.
+BEGIN {
+    main_count = 1
+
+    # getline returns 1 for a line, 0 at end of file and -1 on error.  The
+    # result is compared with 0 explicitly, because a bare
+    # "while (getline < file)" never terminates when the file cannot be read
+    # (-1 counts as true).
+    while ((ent_status = (getline ent_line < ENTLIST)) > 0) {
+        delim = index(ent_line, ":")
+        if (delim == 0) {
+            # No "entity:filename" separator (e.g. a blank line): nothing to
+            # register, and registering it would create an entity named "".
+            continue
+        }
+        i = substr(ent_line, 1, delim - 1)
+
+        fname = substr(ent_line, delim + 1)
+        # Trim any leading and trailing space of filenames
+        gsub(/^[[:space:]]*/, "", fname)
+        gsub(/[[:space:]]*$/, "", fname)
+
+        ent[i] = fname
+        included[i] = 0
+    }
+    if (ent_status < 0) {
+        print "** Error: cannot read entity list '" ENTLIST "'" > "/dev/stderr"
+    }
+    close(ENTLIST)
+}
+
+# Main rule, run for every line of the top-level document.
+# Only lines that consist of a reference to an ".xml" entity (optionally
+# followed by comments) are acted upon; every other line is ignored here.
+{
+    line = $0
+    if (!match(line, /^[[:space:]]*&.*\.xml;[[:space:]]*(<\!--.*-->[[:space:]]*|)*$/)) {
+        next
+    }
+    # A referenced entity at this level starts a new output file
+    process_file(line, "main")
+}
+
+# Final report: append to LOG the name of every entity from the entity table
+# whose "included" marker is still 0, i.e. that was never merged.
+END {
+    print "" >>LOG
+    print "The following defined entities (from docstruct) were NOT processed:" >>LOG
+    for (key in ent) {
+        if (included[key] != 0) {
+            continue
+        }
+        print " " key >>LOG
+    }
+}
+
+# Resolve one entity reference and merge the file it names into the output.
+#
+#   entline - the complete source line holding the reference ("&name.xml;")
+#   level   - "main" when called from the main rule: a new output file is
+#             started; "sub" when called from parse_file: the content is
+#             appended to the current output file
+#
+# fname, entname, infile, outdir and quoted are local variables.
+#
+# Globals read:    ent, WORKDIR, LOG
+# Globals written: included, main_count and, at "main" level, OUTFILE
+#
+# included[entity] is set to 1 for a top-level entity and to the current
+# main_count for a nested one, so that a nested entity pulled in again under
+# a later top-level file can be detected and reported.
+function process_file(entline, level,    fname, entname, infile, outdir, quoted) {
+    entname = get_entname(entline)
+
+    if (!(entname in ent)) {
+        print "** Entity " entname " not found and will be skipped!" >>LOG
+        # Keep the unresolved reference in the output, but only once an
+        # output file has been selected: redirecting to an empty file name
+        # is an error.
+        if (OUTFILE != "") {
+            print entline >>OUTFILE
+        }
+        return
+    }
+
+    fname = ent[entname]
+    print "Processing: " fname >>LOG
+    infile = WORKDIR "/in/" fname
+
+    if (level == "main") {
+        main_count += 1
+
+        # Change at highest level: change to a new output file.  The previous
+        # one is closed first so open files do not pile up; all writes use
+        # ">>", so a later write to it would simply reopen it for appending.
+        if (OUTFILE != "") {
+            close(OUTFILE)
+        }
+        OUTFILE = WORKDIR "/out/" fname
+        outdir = OUTFILE
+        sub(/\/[^\/]*$/, "/", outdir)       # strip filename
+
+        # Single-quote the directory for the shell (embedded quotes become
+        # '\'') so spaces or metacharacters in a path cannot break or alter
+        # the command.
+        quoted = outdir
+        gsub(/'/, "'\\''", quoted)
+        if (system("mkdir -p -- '" quoted "'") != 0) {
+            print "** Error: could not create directory '" outdir "'" >>LOG
+        }
+    } else {
+        print "" >>OUTFILE
+    }
+
+    if (level == "sub" && included[entname] != 0 && included[entname] < main_count) {
+        print "** Warning: entity '" entname "'was also included in another file." >>LOG
+    }
+    included[entname] = (level == "main") ? 1 : main_count
+
+    parse_file(infile, fname)
+}
+
+# Copy the content of one input file to the current output file (OUTFILE),
+# expanding nested entity references recursively.
+#
+#   PARSEFILE - path of the file to read
+#   FNAME     - file name used in the "Start of file"/"End of file" markers
+#
+# fname, nwline, comment_count and status are local variables.
+#
+# Every line of the file is handled, including the first one.  Lines are read
+# into a local variable, so $0 of the caller is left untouched.
+function parse_file(PARSEFILE, FNAME,    fname, nwline, comment_count, status) {
+    comment_count = 0
+    fname = FNAME
+
+    # Test whether the file can be read.  The test uses getline's own return
+    # value (-1 = cannot be read) instead of gawk's ERRNO, which stays set
+    # after an earlier failure.  The line read by this test is processed by
+    # the loop below instead of being thrown away.
+    status = (getline nwline < PARSEFILE)
+    if (status < 0) {
+        print "** Error: file '" PARSEFILE "' does not exist!" >>LOG
+        return
+    }
+
+    print "<!-- Start of file " fname " -->" >>OUTFILE
+    while (status > 0) {
+        # Update the count of 'open' comments
+        comment_count += count_comments(nwline)
+
+        if (match(nwline, /^[[:space:]]*&.*\.xml;[[:space:]]*(<\!--.*-->[[:space:]]*|)*$/) > 0) {
+            # If we find another entity reference, we process that file recursively
+            # But not if the reference is within a comment
+            if (comment_count != 0) {
+                print "** Skipping entity reference '" nwline "' found in comment!" >>LOG
+            } else {
+                process_file(nwline, "sub")
+            }
+        } else {
+            # Else we just print the line
+            if (match(nwline, /<\!--.*<.*>.*<.*>.*-->/) > 0) {
+                # Comments containing "<...> ... <...>" are not handled correctly
+                # by xml2pot and split2po, so we skip lines like that
+                # Note: this is a workaround for a bug in the tools:
+                # http://bugs.kde.org/show_bug.cgi?id=90294
+                print "** Comment deleted in line '" nwline "'" >>LOG
+                gsub(/<\!--.*<.*>.*<.*>.*-->/, "", nwline)
+            }
+            print nwline >>OUTFILE
+        }
+
+        # Stop on end of file (0) as well as on a read error (-1)
+        status = (getline nwline < PARSEFILE)
+    }
+    if (status < 0) {
+        print "** Error: reading file '" PARSEFILE "' failed!" >>LOG
+    }
+    if (comment_count != 0) {
+        print "** Comment count is not zero at end of file: " comment_count >>LOG
+    }
+    print "<!-- End of file " fname " -->" >>OUTFILE
+    close(PARSEFILE)
+}
+
+# Return the entity name contained in an entity reference line.
+#
+#   entline - a line of the form "<spaces>&name;<anything>"
+#
+# Leading white space plus the "&" are cut off, as is everything from the
+# first ";" to the end of the line.  "name" is a local variable.
+function get_entname(entline,    name) {
+    name = entline
+    sub(/^[[:space:]]*&/, "", name)     # anchored: at most one match
+    sub(/;.*$/, "", name)               # greedy from the first ";" onwards
+    return name
+}
+
+# Return the net number of comments opened on a line: the number of "<!--"
+# sequences minus the number of "-->" sequences.
+#
+#   inline - the line to inspect (not modified)
+#
+# gsub() is used only for its return value, the number of replacements made.
+# The openers are removed from the scratch copy before the closers are
+# counted.  "scratch" and "opened" are local variables.
+function count_comments(inline,    scratch, opened) {
+    scratch = inline
+    opened = gsub(/<\!--/, "", scratch)
+    return opened - gsub(/-->/, "", scratch)
+}