/*
 * Decompiled with CFR 0.152.
 */
package ro.sync.textsearch.html;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.lucene.document.Document;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleXmlSerializer;
import org.htmlcleaner.TagNode;
import org.xml.sax.InputSource;
import ro.sync.textsearch.DocumentCreator;
import ro.sync.textsearch.webhelp.converter.IContentConverter;
import ro.sync.textsearch.webhelp.embeddings.IEmbeddingProvider;
import ro.sync.textsearch.xml.XMLDocumentCreator;
import ro.sync.textsearch.xml.XMLOptions;

/**
 * Document creator for HTML input.
 *
 * <p>HTML is first run through HtmlCleaner and re-serialized as XML, and the
 * result is then handed to the {@link XMLDocumentCreator} implementation this
 * class extends.
 */
public class HTMLDocumentCreator extends XMLDocumentCreator {
    /** HtmlCleaner settings used both for cleaning and for XML serialization. */
    private final CleanerProperties cleanerProps = new CleanerProperties();

    /**
     * Creates an instance that delegates to the four-argument constructor with
     * both boolean flags set to {@code false} and a content size limit of
     * {@code -1}.
     *
     * @param xmlOptions options forwarded to the XML document creator
     */
    public HTMLDocumentCreator(XMLOptions xmlOptions) {
        this(xmlOptions, false, false, -1);
    }

    /**
     * Creates an instance and configures the HtmlCleaner properties used by
     * {@link #makeWellFormed(InputSource)}.
     *
     * @param xmlOptions options forwarded to the superclass
     * @param isSkimmingMode forwarded to the superclass
     * @param bindAllElementsToFields forwarded to the superclass
     * @param xmlAwareContentSizeLimit forwarded to the superclass
     */
    public HTMLDocumentCreator(XMLOptions xmlOptions, boolean isSkimmingMode, boolean bindAllElementsToFields, int xmlAwareContentSizeLimit) {
        super(xmlOptions, isSkimmingMode, bindAllElementsToFields, xmlAwareContentSizeLimit);
        // See the HtmlCleaner CleanerProperties documentation for the exact
        // semantics of each switch; the values below are what this class relies on.
        this.cleanerProps.setTransResCharsToNCR(true);
        this.cleanerProps.setKeepWhitespaceAndCommentsInHead(true);
        this.cleanerProps.setAdvancedXmlEscape(true);
        this.cleanerProps.setRecognizeUnicodeChars(false);
        this.cleanerProps.setOmitDoctypeDeclaration(false);
        this.cleanerProps.setIgnoreQuestAndExclam(false);
    }

    /**
     * Converts the HTML source to XML via {@link #makeWellFormed(InputSource)}
     * and delegates to the superclass implementation; all other arguments are
     * passed through unchanged.
     *
     * @throws IOException if the source has no stream or cannot be read
     * @throws DocumentCreator.ParseException as declared by the superclass method
     */
    @Override
    public Document[] createDocuments(InputSource source, long contentLenght, List<String> tagsAndClassesToIgnore, IContentConverter contentConverter, IEmbeddingProvider embeddingsProvider, boolean generateCompletionFields) throws IOException, DocumentCreator.ParseException {
        InputSource wellFormedSource = this.makeWellFormed(source);
        return super.createDocuments(wellFormedSource, contentLenght, tagsAndClassesToIgnore, contentConverter, embeddingsProvider, generateCompletionFields);
    }

    /**
     * Cleans the HTML content of {@code source} with HtmlCleaner and returns a
     * new in-memory {@link InputSource} holding the serialized XML as UTF-8 bytes.
     *
     * <p>The returned source keeps the system and public identifiers of the
     * original and declares {@code UTF-8} as its encoding.
     *
     * @param source the HTML input; must provide a byte stream or a character stream
     * @return an input source over the cleaned, serialized content
     * @throws IOException if {@code source} has neither a byte stream nor a
     *         character stream, or if reading/serializing fails
     */
    protected InputSource makeWellFormed(InputSource source) throws IOException {
        HtmlCleaner htmlCleaner = new HtmlCleaner(this.cleanerProps);
        TagNode rootNode;
        // NOTE(review): SAX gives the character stream precedence over the byte
        // stream; the byte-stream-first order is kept here for compatibility.
        if (source.getByteStream() != null) {
            String declaredEncoding = source.getEncoding();
            if (declaredEncoding != null && !declaredEncoding.isEmpty()) {
                // Honor the encoding the caller declared instead of silently
                // decoding with the cleaner's default charset.
                rootNode = htmlCleaner.clean(source.getByteStream(), declaredEncoding);
            } else {
                rootNode = htmlCleaner.clean(source.getByteStream());
            }
        } else if (source.getCharacterStream() != null) {
            rootNode = htmlCleaner.clean(source.getCharacterStream());
        } else {
            throw new IOException("Incomplete source, no reader or input stream for " + source.getSystemId());
        }

        String outputEncoding = StandardCharsets.UTF_8.name();
        ByteArrayOutputStream xmlBytes = new ByteArrayOutputStream();
        new SimpleXmlSerializer(this.cleanerProps).writeToStream(rootNode, xmlBytes, outputEncoding);

        InputSource wfInputSource = new InputSource(source.getSystemId());
        wfInputSource.setPublicId(source.getPublicId());
        wfInputSource.setByteStream(new ByteArrayInputStream(xmlBytes.toByteArray()));
        // The bytes were just written as UTF-8, so declare that explicitly
        // rather than leaving the parser to auto-detect.
        wfInputSource.setEncoding(outputEncoding);
        return wfInputSource;
    }
}

