/*
 * Decompiled with CFR 0.152.
 */
package ro.sync.textsearch.pdf;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexableField;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.text.PDFTextStripper;
import org.xml.sax.InputSource;
import ro.sync.textsearch.DocumentCreator;
import ro.sync.textsearch.fields.StoredTextField;
import ro.sync.textsearch.fields.TextWithTermVectorsAndPositionsField;
import ro.sync.textsearch.util.Normalizer;
import ro.sync.textsearch.util.TrimUtil;
import ro.sync.textsearch.webhelp.converter.IContentConverter;
import ro.sync.textsearch.webhelp.embeddings.IEmbeddingProvider;

/**
 * {@link DocumentCreator} that reads a PDF from the byte stream of an
 * {@link InputSource} and produces exactly one Lucene {@link Document} for it.
 *
 * <p>The text stored in the document is assembled in this order: the PDF subject
 * (when present), the text extracted from every page, the PDF title (when present)
 * and the PDF keywords (when present).
 */
public class PDFDocumentCreator implements DocumentCreator {
    /** Field receiving the description derived from the subject or, failing that, from the content. */
    private static final String DESCRIPTION_FIELD = "__desc__";
    /** Field receiving the PDF title. */
    private static final String TITLE_FIELD = "__xml_title__";
    /** Field receiving the full assembled text. */
    private static final String TEXT_FIELD = "__text__";
    /** Field receiving the assembled text a second time when completion fields are requested. */
    private static final String COMPLETION_FIELD = "__completion_terms__";
    /** Length argument handed to {@code TrimUtil.extractDescription} for the title. */
    private static final int TITLE_DESC_LEN = 128;

    private final int descLen;
    private Normalizer normalizer;

    /**
     * @param descLen length used both to configure the {@link Normalizer} and as the
     *                length argument when a description is extracted
     */
    public PDFDocumentCreator(int descLen) {
        this.descLen = descLen;
        this.normalizer = new Normalizer(descLen);
    }

    /**
     * Loads the PDF found in the byte stream of {@code source} and converts it into a
     * one-element array holding the resulting Lucene document.
     *
     * <p>Only {@code source} and {@code generateCompletionFields} are read by this
     * implementation; the remaining parameters are not used here.
     *
     * @param source                   input whose byte stream contains the PDF
     * @param generateCompletionFields when {@code true}, the assembled text is also added
     *                                 under the completion-terms field
     * @return an array with the single document built from the PDF
     * @throws IOException if {@code source} exposes no byte stream, or if loading the PDF
     *                     or extracting its text fails
     */
    @Override
    public Document[] createDocuments(InputSource source, long contentLength, List<String> tagsAndClasesToIgnore, IContentConverter contentConverter, IEmbeddingProvider embeddingsProvider, boolean generateCompletionFields) throws IOException {
        InputStream pdfBytes = source.getByteStream();
        if (pdfBytes == null) {
            throw new IOException("The PDF source has no binary data");
        }

        try (PDDocument pdf = PDDocument.load(pdfBytes)) {
            PDDocumentInformation info = pdf.getDocumentInformation();
            Document luceneDoc = new Document();
            StringWriter text = new StringWriter();

            // The subject, when available, leads the text and doubles as the description.
            String subject = info.getSubject();
            boolean hasSubject = subject != null;
            if (hasSubject) {
                appendFollowedBySpace(text, subject);
                luceneDoc.add(this.newDescriptionField(subject));
            }

            // Extract the text of all pages, first to last, straight into the shared writer.
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.setStartPage(1);
            stripper.setEndPage(pdf.getNumberOfPages());
            stripper.writeText(pdf, text);

            // NOTE(review): nothing is written between the extracted page text and the title;
            // whether the stripper output already ends in whitespace is not visible here.
            String title = info.getTitle();
            if (title != null) {
                appendFollowedBySpace(text, title);
                luceneDoc.add(new StoredTextField(TITLE_FIELD, TrimUtil.extractDescription(this.normalizer, title, TITLE_DESC_LEN)));
            }

            String keywords = info.getKeywords();
            if (keywords != null) {
                appendFollowedBySpace(text, keywords);
            }

            String content = text.toString();
            luceneDoc.add(new StoredTextField(TEXT_FIELD, content));
            if (generateCompletionFields) {
                luceneDoc.add(new TextWithTermVectorsAndPositionsField(COMPLETION_FIELD, content));
            }

            // Without a subject the description is taken from the whole assembled text instead.
            if (!hasSubject) {
                luceneDoc.add(this.newDescriptionField(content));
            }
            return new Document[]{luceneDoc};
        }
    }

    /** Writes {@code value} to {@code out}, followed by a single space character. */
    private static void appendFollowedBySpace(StringWriter out, String value) {
        out.write(value);
        out.write(' ');
    }

    /** Creates the stored description field from {@code text}, using this creator's normalizer and length. */
    private StringField newDescriptionField(String text) {
        return new StringField(DESCRIPTION_FIELD, TrimUtil.extractDescription(this.normalizer, text, this.descLen), Field.Store.YES);
    }
}

