XmlCompressor.java

/*
 *    Copyright 2009-2022 the original author or authors.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */
package com.googlecode.htmlcompressor.compressor;

import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Class that compresses given XML source by removing comments, extra spaces and line breaks while preserving content
 * within CDATA blocks.
 *
 * @author <a href="mailto:serg472@gmail.com">Sergiy Kovalchuk</a>
 */
public class XmlCompressor implements Compressor {

    /** The enabled. */
    private boolean enabled = true;

    /** The remove comments. */
    // default settings
    private boolean removeComments = true;

    /** The remove intertag spaces. */
    private boolean removeIntertagSpaces = true;

    /** The Constant tempCdataBlock. */
    // temp replacements for preserved blocks
    protected static final String TEMP_CD_DATA_BLOCK = "%%%COMPRESS~CDATA~{0,number,#}%%%";

    /** The Constant cdataPattern. */
    // compiled regex patterns
    protected static final Pattern cdataPattern = Pattern.compile("<!\\[CDATA\\[.*?\\]\\]>",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /** The Constant commentPattern. */
    protected static final Pattern commentPattern = Pattern.compile("<!--.*?-->",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /** The Constant intertagPattern. */
    protected static final Pattern intertagPattern = Pattern.compile(">\\s+<",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /** The Constant tagEndSpacePattern. */
    protected static final Pattern tagEndSpacePattern = Pattern.compile("(<(?:[^>]+?))(?:\\s+?)(/?>)",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /** The Constant multispacePattern. */
    protected static final Pattern multispacePattern = Pattern.compile("\\s+(?=[^<]*?>)",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /** The Constant tagPropertyPattern. */
    protected static final Pattern tagPropertyPattern = Pattern.compile("(\\s\\w+)\\s*=\\s*(?=[^<]*?>)",
            Pattern.CASE_INSENSITIVE);

    /** The Constant tempCdataPattern. */
    protected static final Pattern tempCdataPattern = Pattern.compile("%%%COMPRESS~CDATA~(\\d+?)%%%",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /**
     * The main method that compresses given XML source and returns compressed result.
     *
     * @param xml
     *            XML content to compress
     *
     * @return compressed content.
     */
    @Override
    public String compress(String xml) {
        if (!enabled || xml == null || xml.length() == 0) {
            return xml;
        }

        // preserved block containers
        List<String> cdataBlocks = new ArrayList<>();

        // preserve blocks
        xml = preserveBlocks(xml, cdataBlocks);

        // process pure xml
        xml = processXml(xml);

        // return preserved blocks
        xml = returnBlocks(xml, cdataBlocks);

        return xml.trim();
    }

    /**
     * Preserve blocks.
     *
     * @param xml
     *            the xml
     * @param cdataBlocks
     *            the cdata blocks
     *
     * @return the string
     */
    protected String preserveBlocks(String xml, List<String> cdataBlocks) {
        // preserve CDATA blocks
        Matcher matcher = cdataPattern.matcher(xml);
        int index = 0;
        StringBuilder sb = new StringBuilder();
        while (matcher.find()) {
            cdataBlocks.add(matcher.group(0));
            matcher.appendReplacement(sb, MessageFormat.format(TEMP_CD_DATA_BLOCK, index++));
        }
        matcher.appendTail(sb);
        xml = sb.toString();

        return xml;
    }

    /**
     * Return blocks.
     *
     * @param xml
     *            the xml
     * @param cdataBlocks
     *            the cdata blocks
     *
     * @return the string
     */
    protected String returnBlocks(String xml, List<String> cdataBlocks) {
        // put CDATA blocks back
        Matcher matcher = tempCdataPattern.matcher(xml);
        StringBuilder sb = new StringBuilder();
        while (matcher.find()) {
            matcher.appendReplacement(sb,
                    Matcher.quoteReplacement(cdataBlocks.get(Integer.parseInt(matcher.group(1)))));
        }
        matcher.appendTail(sb);
        xml = sb.toString();

        return xml;
    }

    /**
     * Process xml.
     *
     * @param xml
     *            the xml
     *
     * @return the string
     */
    protected String processXml(String xml) {
        // remove comments
        xml = removeComments(xml);

        // remove inter-tag spaces
        xml = removeIntertagSpaces(xml);

        // remove unneeded spaces inside tags
        xml = removeSpacesInsideTags(xml);

        return xml;
    }

    /**
     * Removes the spaces inside tags.
     *
     * @param xml
     *            the xml
     *
     * @return the string
     */
    protected String removeSpacesInsideTags(String xml) {
        // replace miltiple spaces inside tags with single spaces
        xml = multispacePattern.matcher(xml).replaceAll(" ");

        // remove spaces around equal sign inside tags
        xml = tagPropertyPattern.matcher(xml).replaceAll("$1=");

        // remove ending spaces inside tags
        xml = tagEndSpacePattern.matcher(xml).replaceAll("$1$2");
        return xml;
    }

    /**
     * Removes the intertag spaces.
     *
     * @param xml
     *            the xml
     *
     * @return the string
     */
    protected String removeIntertagSpaces(String xml) {
        // remove inter-tag spaces
        if (removeIntertagSpaces) {
            xml = intertagPattern.matcher(xml).replaceAll("><");
        }
        return xml;
    }

    /**
     * Removes the comments.
     *
     * @param xml
     *            the xml
     *
     * @return the string
     */
    protected String removeComments(String xml) {
        // remove comments
        if (removeComments) {
            xml = commentPattern.matcher(xml).replaceAll("");
        }
        return xml;
    }

    /**
     * Returns <code>true</code> if compression is enabled.
     *
     * @return <code>true</code> if compression is enabled.
     */
    public boolean isEnabled() {
        return enabled;
    }

    /**
     * If set to <code>false</code> all compression will be bypassed. Might be useful for testing purposes. Default is
     * <code>true</code>.
     *
     * @param enabled
     *            set <code>false</code> to bypass all compression
     */
    public void setEnabled(boolean enabled) {
        this.enabled = enabled;
    }

    /**
     * Returns <code>true</code> if all XML comments will be removed.
     *
     * @return <code>true</code> if all XML comments will be removed
     */
    public boolean isRemoveComments() {
        return removeComments;
    }

    /**
     * If set to <code>true</code> all XML comments will be removed. Default is <code>true</code>.
     *
     * @param removeComments
     *            set <code>true</code> to remove all XML comments
     */
    public void setRemoveComments(boolean removeComments) {
        this.removeComments = removeComments;
    }

    /**
     * Returns <code>true</code> if all inter-tag whitespace characters will be removed.
     *
     * @return <code>true</code> if all inter-tag whitespace characters will be removed.
     */
    public boolean isRemoveIntertagSpaces() {
        return removeIntertagSpaces;
    }

    /**
     * If set to <code>true</code> all inter-tag whitespace characters will be removed. Default is <code>true</code>.
     *
     * @param removeIntertagSpaces
     *            set <code>true</code> to remove all inter-tag whitespace characters
     */
    public void setRemoveIntertagSpaces(boolean removeIntertagSpaces) {
        this.removeIntertagSpaces = removeIntertagSpaces;
    }

}