Skip to content

Commit

Permalink
[BUG] Introduce parser supplier support in FileSystemDocumentLoader#loadDocuments langchain4j#1026
Browse files Browse the repository at this point in the history
  • Loading branch information
KaisNeffati committed May 3, 2024
1 parent d28f5ab commit a85a0d5
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 11 deletions.
Expand Up @@ -12,6 +12,7 @@
import org.xml.sax.ContentHandler;

import java.io.InputStream;
import java.util.function.Supplier;

import static dev.langchain4j.internal.Utils.getOrDefault;
import static dev.langchain4j.internal.Utils.isNullOrBlank;
Expand All @@ -25,19 +26,23 @@
public class ApacheTikaDocumentParser implements DocumentParser {

/** Write limit handed to {@link BodyContentHandler}; this value requests a handler without a write limit. */
private static final int NO_WRITE_LIMIT = -1;

/** Default parser source: creates a new {@link AutoDetectParser} on every call. */
public static final Supplier<Parser> DEFAULT_PARSER_SUPPLIER = AutoDetectParser::new;

/** Default metadata source: creates a new, empty {@link Metadata} on every call. */
public static final Supplier<Metadata> DEFAULT_METADATA_SUPPLIER = Metadata::new;

/** Default parse-context source: creates a new, empty {@link ParseContext} on every call. */
public static final Supplier<ParseContext> DEFAULT_PARSE_CONTEXT_SUPPLIER = ParseContext::new;

/** Default content-handler source: creates a new {@link BodyContentHandler} without write limit on every call. */
public static final Supplier<ContentHandler> DEFAULT_CONTENT_HANDLER_SUPPLIER = () -> new BodyContentHandler(NO_WRITE_LIMIT);

// Only suppliers are stored: the Tika components themselves are obtained from these
// suppliers inside parse(). A null supplier means "use the matching DEFAULT_*_SUPPLIER".
private final Supplier<Parser> parserSupplier;
private final Supplier<ContentHandler> contentHandlerSupplier;
private final Supplier<Metadata> metadataSupplier;
private final Supplier<ParseContext> parseContextSupplier;

/**
 * Creates an instance of an {@code ApacheTikaDocumentParser} with the default Tika components.
 * It uses {@link AutoDetectParser}, {@link BodyContentHandler} without write limit,
 * empty {@link Metadata} and empty {@link ParseContext}.
 */
public ApacheTikaDocumentParser() {
    // The cast selects the supplier-based overload: a bare null literal would match
    // both four-argument constructors and make the call ambiguous.
    this((Supplier<Parser>) null, null, null, null);
}

/**
Expand All @@ -48,22 +53,45 @@ public ApacheTikaDocumentParser() {
* @param contentHandler Tika content handler. Default: {@link BodyContentHandler} without write limit
* @param metadata Tika metadata. Default: empty {@link Metadata}
* @param parseContext Tika parse context. Default: empty {@link ParseContext}
* @deprecated Use the constructor with suppliers for Tika components if you intend to use this parser for multiple files.
*/
@Deprecated
public ApacheTikaDocumentParser(Parser parser,
                                ContentHandler contentHandler,
                                Metadata metadata,
                                ParseContext parseContext) {
    // Map a null component to a null supplier, not to a supplier that yields null:
    // parse() substitutes the default only when the supplier itself is null, so
    // "() -> null" would bypass the documented defaults and hand a null component
    // to the parse call. Non-null components are still shared across parse() calls,
    // which is why this constructor is deprecated for multi-file use.
    this(
            parser == null ? null : () -> parser,
            contentHandler == null ? null : () -> contentHandler,
            metadata == null ? null : () -> metadata,
            parseContext == null ? null : () -> parseContext
    );
}

/**
 * Builds an {@code ApacheTikaDocumentParser} from suppliers of the Tika components.
 * Any supplier may be {@code null}; the corresponding default is then used instead.
 * The suppliers are stored as given and are not invoked by this constructor.
 *
 * @param parserSupplier         source of the Tika parser. Default: {@link AutoDetectParser}
 * @param contentHandlerSupplier source of the Tika content handler. Default: {@link BodyContentHandler} without write limit
 * @param metadataSupplier       source of the Tika metadata. Default: empty {@link Metadata}
 * @param parseContextSupplier   source of the Tika parse context. Default: empty {@link ParseContext}
 */
public ApacheTikaDocumentParser(Supplier<Parser> parserSupplier,
                                Supplier<ContentHandler> contentHandlerSupplier,
                                Supplier<Metadata> metadataSupplier,
                                Supplier<ParseContext> parseContextSupplier) {
    // Plain storage only; null suppliers are kept as null.
    this.parseContextSupplier = parseContextSupplier;
    this.metadataSupplier = metadataSupplier;
    this.contentHandlerSupplier = contentHandlerSupplier;
    this.parserSupplier = parserSupplier;
}

// TODO allow automatically extract metadata (e.g. creator, last-author, created/modified timestamp, etc)

@Override
public Document parse(InputStream inputStream) {
try {
Parser parser = getOrDefault(parserSupplier, () -> DEFAULT_PARSER_SUPPLIER).get();
ContentHandler contentHandler = getOrDefault(contentHandlerSupplier, () -> DEFAULT_CONTENT_HANDLER_SUPPLIER).get();
Metadata metadata = getOrDefault(metadataSupplier, () -> DEFAULT_METADATA_SUPPLIER).get();
ParseContext parseContext = getOrDefault(parseContextSupplier, () -> DEFAULT_PARSE_CONTEXT_SUPPLIER).get();

parser.parse(inputStream, contentHandler, metadata, parseContext);
String text = contentHandler.toString();

Expand Down
@@ -1,9 +1,10 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.parser.AutoDetectParser;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

Expand Down Expand Up @@ -40,7 +41,7 @@ void should_parse_doc_ppt_and_pdf_files(String fileName) {
})
void should_parse_xls_files(String fileName) {

DocumentParser parser = new ApacheTikaDocumentParser(new AutoDetectParser(), null, null, null);
DocumentParser parser = new ApacheTikaDocumentParser(AutoDetectParser::new, null, null, null);
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);

Document document = parser.parse(inputStream);
Expand All @@ -50,6 +51,24 @@ void should_parse_xls_files(String fileName) {
assertThat(document.metadata().asMap()).isEmpty();
}

@Test
void should_parse_files_stateless() {

    // given: one parser instance and two independent streams over the same workbook
    DocumentParser sharedParser = new ApacheTikaDocumentParser();
    ClassLoader resources = getClass().getClassLoader();
    InputStream firstStream = resources.getResourceAsStream("test-file.xls");
    InputStream secondStream = resources.getResourceAsStream("test-file.xls");

    // when: the same parser instance is used twice in a row
    Document firstResult = sharedParser.parse(firstStream);
    Document secondResult = sharedParser.parse(secondStream);

    // then: both runs produce the same text and no metadata
    assertThat(firstResult.text())
            .isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content");
    assertThat(secondResult.text())
            .isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content");
    assertThat(firstResult.metadata().asMap()).isEmpty();
    assertThat(secondResult.metadata().asMap()).isEmpty();
}

@ParameterizedTest
@ValueSource(strings = {
"empty-file.txt",
Expand Down

0 comments on commit a85a0d5

Please sign in to comment.