# scripts/merge_xml.awk

# The script handles two special situations:
# - 'tags' inside comments are not handled well by the poxml tools, so
#   comments containing them are removed
# - entity references inside comments must not be processed, so we keep
#   a running count of comment openings and closings

# Initialisation: load the entity list before any input record is read.
#
# ENTLIST names a file with one "entity:filename" pair per line (the variable
# is not assigned in this script; presumably it is supplied on the command
# line). For every pair this fills
#   ent[entity]      - the file name the entity stands for
#   included[entity] - 0, meaning "not included yet" (updated by process_file)
# main_count starts at 1 and is incremented by process_file for every
# top-level file.
BEGIN {
    main_count = 1

    # Let's first build an array with all the entities (xml files).
    # The getline result is compared with 0 explicitly: getline returns -1
    # when the file cannot be read, and a bare "while (getline <file)" would
    # then loop forever.
    while ((ent_status = (getline <ENTLIST)) > 0) {
        delim = index($0, ":")
        if (delim == 0) {
            # Blank or malformed line without a ':' separator: there is no
            # entity name to record, so do not create an empty-named entry
            continue
        }
        i = substr($0, 1, delim - 1)

        fname = substr($0, delim + 1, length($0) - delim)
        # Trim any leading and trailing space of filenames
        gsub(/^[[:space:]]*/, "", fname)
        gsub(/[[:space:]]*$/, "", fname)

        ent [i] = fname
        included [i] = 0
    }
    if (ent_status < 0) {
        print "** Error: entity list '" ENTLIST "' could not be read!" >>LOG
    }
    # Release the descriptor; the list is not read again
    close(ENTLIST)
}

# Main rule: of all input records, only those that refer to an entity are
# processed, i.e. lines holding nothing but a reference to an .xml entity,
# optionally followed by comments. Everything else is ignored here.
match($0, /^[[:space:]]*&.*\.xml;[[:space:]]*(<\!--.*-->[[:space:]]*|)*$/) {
    line = $0
    process_file(line, "main")
}

# Final report: append to LOG the name of every entity from the entity list
# whose 'included' marker is still 0, i.e. that was never processed.
END {
    print "" >>LOG
    print "The following defined entities (from docstruct) were NOT processed:" >>LOG
    for (entname in ent) {
        if (included [entname] != 0) {
            # This one was merged somewhere; nothing to report
            continue
        }
        print "  " entname >>LOG
    }
}

# process_file(entline, level)
#
# Resolve the entity reference found in 'entline' and merge the file it
# names into the output.
#   entline - the input line holding the reference (e.g. "&foo.xml;")
#   level   - "main" when called from the main rule, "sub" when called from
#             parse_file for a nested reference
#
# At "main" level a new output file WORKDIR "/out/" <file> is selected (its
# directory is created with mkdir -p) and main_count is incremented; at
# "sub" level an empty line is appended to the current output file first.
# included[] records where an entity was used: 1 for a top-level file,
# otherwise the main_count of the top-level file that pulled it in. A
# warning is logged when a sub-level entity was already included while an
# earlier top-level file was being processed.
# Unknown entities are logged and the reference line is copied unchanged.
#
# Uses the globals ent, included, main_count, WORKDIR and LOG; sets the
# globals INFILE, OUTFILE and OUTDIR. The names after the extra spaces in
# the parameter list are locals.
function process_file(entline, level,   fname, entname, mkdir_arg) {
        entname = get_entname(entline)
        if (!(entname in ent)) {
            print "** Entity " entname " not found and will be skipped!" >>LOG
            # Before the first top-level file has been selected there is no
            # output file to copy the reference to
            if (OUTFILE != "") {
                print entline >>OUTFILE
            }
            return
        }

        fname = ent [entname]
        print "Processing: " fname >>LOG
        INFILE = WORKDIR "/in/" fname

        if (level == "main") {
            main_count += 1

            # The previous top-level output file is complete; close it so
            # that open descriptors do not pile up. All writes use ">>", so
            # a later write to the same name simply reopens it for append.
            if (OUTFILE != "") {
                close(OUTFILE)
            }

            # Change at highest level: change to a new output file
            OUTFILE = WORKDIR "/out/" fname
            OUTDIR = OUTFILE
            gsub(/\/[^\/]*$/, "/", OUTDIR) # strip filename

            # Create the directory. The path is single-quoted for the shell
            # (embedded single quotes become '\'') so that spaces or shell
            # metacharacters in it are passed through literally.
            mkdir_arg = OUTDIR
            gsub(/'/, "'\\''", mkdir_arg)
            system("mkdir -p '" mkdir_arg "'")
        } else {
            print "" >>OUTFILE
        }

        if (level == "sub" && included [entname] != 0 && included [entname] < main_count) {
            print "** Warning: entity '" entname "' was also included in another file." >>LOG
        }
        if (level == "main") {
            included [entname] = 1
        } else {
            included [entname] = main_count
        }
        parse_file(INFILE, fname)
}

# parse_file(PARSEFILE, FNAME)
#
# Copy the contents of PARSEFILE to the current OUTFILE, wrapped in
# "Start of file" / "End of file" marker comments that carry FNAME.
#   PARSEFILE - path of the file to read
#   FNAME     - name used in the marker comments
#
# While copying:
# - a line consisting of an entity reference to an .xml file is replaced by
#   the referenced file via process_file(line, "sub"), unless it is inside
#   an open comment, in which case it is dropped and logged;
# - comments of the form "<!-- ... <...> ... <...> ... -->" are removed from
#   the line before it is written.
# A file that yields no first line (missing, unreadable or empty) is
# reported in LOG and nothing is written for it.
#
# The names after the extra spaces in the parameter list are locals; they
# must stay local because this function is re-entered through process_file.
function parse_file(PARSEFILE, FNAME,   fname, nwline, comment_count, status) {
    comment_count = 0
    fname = FNAME

    # Test whether file exists by reading its first line. The line is kept
    # in nwline and handled by the loop below like every other line, so the
    # probe does not swallow it.
    status = (getline nwline <PARSEFILE)
    if (status <= 0) {
        print "** Error: file '" PARSEFILE "' does not exist!" >>LOG
        # An empty file was opened successfully; release it (harmless if the
        # open failed)
        close(PARSEFILE)
        return
    }

    print "<!-- Start of file " fname " -->" >>OUTFILE
    # Loop on "status > 0" so that a read error (-1) ends the loop instead of
    # being treated as true forever
    while (status > 0) {
        # Update the count of 'open' comments
        comment_count += count_comments(nwline)

        if (match(nwline, /^[[:space:]]*&.*\.xml;[[:space:]]*(<\!--.*-->[[:space:]]*|)*$/) > 0) {
            # If we find another entity reference, we process that file recursively
            # But not if the reference is within a comment
            if (comment_count != 0) {
                print "** Skipping entity reference '" nwline "' found in comment!" >>LOG
            } else {
                process_file(nwline, "sub")
            }
        } else {
            # Else we just print the line
            if (match(nwline, /<\!--.*<.*>.*<.*>.*-->/) > 0) {
                # Comments containing "<...> ... <...>" are not handled correctly
                # by xml2pot and split2po, so we skip lines like that
                # Note: this is a workaround for a bug in the tools:
                #       http://bugs.kde.org/show_bug.cgi?id=90294
                print "** Comment deleted in line '" nwline "'" >>LOG
                gsub(/<\!--.*<.*>.*<.*>.*-->/, "", nwline)
            }
            print nwline >>OUTFILE
        }

        status = (getline nwline <PARSEFILE)
    }
    if (comment_count != 0) {
        print "** Comment count is not zero at end of file: " comment_count >>LOG
    }
    print "<!--   End of file " fname " -->" >>OUTFILE
    close(PARSEFILE)
}

# get_entname(entline)
#
# Return the entity name contained in an entity reference line: the text
# between the leading "&" (after optional whitespace) and the first ";".
# 'name' is a local.
function get_entname(entline,   name) {
    name = entline
    # Drop the optional indentation and the '&' that opens the reference
    sub(/^[[:space:]]*&/, "", name)
    # Drop the first ';' and everything that follows it
    sub(/;.*$/, "", name)
    return name
}

# count_comments(inline)
#
# Return the net number of comments opened on a line: the number of "<!--"
# openers minus the number of "-->" closers. A positive result means more
# comments were opened than closed, a negative one the opposite.
# The names after the extra spaces in the parameter list are locals.
function count_comments(inline,   scratch, opened, closed) {
    # gsub returns the number of substitutions made, which is used here
    # purely as an occurrence counter on a scratch copy of the line.
    scratch = inline
    # Openers are removed first; closers are then counted in what remains
    opened = gsub(/<\!--/, "", scratch)
    closed = gsub(/-->/, "", scratch)
    return opened - closed
}