Skip to content

Commit

Permalink
[BUG] Introduce parser supplier support in FileSystemDocumentLoader (#1031)
Browse files Browse the repository at this point in the history

## Issue
#1026


## General checklist
<!-- Please double-check the following points and mark them like this:
[X] -->
- [X] There are no breaking changes
- [X] I have added unit and integration tests for my change
- [X] I have manually run all the unit and integration tests in the
module I have added/changed, and they are all green
- [X] I have manually run all the unit and integration tests in the
[core](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-core)
and
[main](https://github.com/langchain4j/langchain4j/tree/main/langchain4j)
modules, and they are all green
- [X] I have added/updated the
[documentation](https://github.com/langchain4j/langchain4j/tree/main/docs/docs)
- [ ] I have added an example in the [examples
repo](https://github.com/langchain4j/langchain4j-examples) (only for
"big" features)
  • Loading branch information
KaisNeffati committed May 6, 2024
1 parent d28f5ab commit f34c543
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 12 deletions.
@@ -1,7 +1,7 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.metadata.Metadata;
Expand All @@ -12,6 +12,7 @@
import org.xml.sax.ContentHandler;

import java.io.InputStream;
import java.util.function.Supplier;

import static dev.langchain4j.internal.Utils.getOrDefault;
import static dev.langchain4j.internal.Utils.isNullOrBlank;
Expand All @@ -25,19 +26,23 @@
public class ApacheTikaDocumentParser implements DocumentParser {

// Write-limit argument handed to BodyContentHandler by the default content handler supplier below;
// the class documentation describes the resulting handler as "without write limit".
private static final int NO_WRITE_LIMIT = -1;
/** Default parser factory: every {@code get()} constructs a new {@link AutoDetectParser}. */
public static final Supplier<Parser> DEFAULT_PARSER_SUPPLIER = AutoDetectParser::new;
/** Default metadata factory: every {@code get()} constructs a new, empty {@link Metadata}. */
public static final Supplier<Metadata> DEFAULT_METADATA_SUPPLIER = Metadata::new;
/** Default parse context factory: every {@code get()} constructs a new {@link ParseContext}. */
public static final Supplier<ParseContext> DEFAULT_PARSE_CONTEXT_SUPPLIER = ParseContext::new;
/** Default content handler factory: every {@code get()} constructs a new {@link BodyContentHandler} with {@code NO_WRITE_LIMIT}. */
public static final Supplier<ContentHandler> DEFAULT_CONTENT_HANDLER_SUPPLIER = () -> new BodyContentHandler(NO_WRITE_LIMIT);

// Single Tika component instances, assigned directly by the component-based constructor.
// NOTE(review): in this commit view these appear next to the supplier fields that seem to replace them —
// confirm which set of fields the final file keeps.
private final Parser parser;
private final ContentHandler contentHandler;
private final Metadata metadata;
private final ParseContext parseContext;
// Factories for the Tika components; parse(InputStream) calls get() on each of them at the start of every call.
private final Supplier<Parser> parserSupplier;
private final Supplier<ContentHandler> contentHandlerSupplier;
private final Supplier<Metadata> metadataSupplier;
private final Supplier<ParseContext> parseContextSupplier;

/**
* Creates an instance of an {@code ApacheTikaDocumentParser} with the default Tika components.
* It uses {@link AutoDetectParser}, {@link BodyContentHandler} without write limit,
* empty {@link Metadata} and empty {@link ParseContext}.
* <p>
* Delegates to a four-argument constructor, passing {@code null} for every component so that
* the defaults are applied.
*/
public ApacheTikaDocumentParser() {
// NOTE(review): four un-cast nulls fit both four-argument constructors (component instances and suppliers);
// this looks like the pre-change line of the commit view — confirm it is gone from the final file.
this(null, null, null, null);
// The cast gives the first null the static type Supplier<Parser>, which selects the supplier-based
// constructor instead of the overload that takes Tika component instances.
this((Supplier<Parser>) null, null, null, null);
}

/**
Expand All @@ -48,22 +53,50 @@ public ApacheTikaDocumentParser() {
* @param contentHandler Tika content handler. Default: {@link BodyContentHandler} without write limit
* @param metadata Tika metadata. Default: empty {@link Metadata}
* @param parseContext Tika parse context. Default: empty {@link ParseContext}
* @deprecated Use the constructor with suppliers for Tika components if you intend to use this parser for multiple files.
*/
@Deprecated
public ApacheTikaDocumentParser(Parser parser,
ContentHandler contentHandler,
Metadata metadata,
ParseContext parseContext) {
// NOTE(review): the four direct field assignments below look like the pre-change lines of this commit
// view (they sit before the this(...) delegation) — confirm they are removed in the final file.
this.parser = getOrDefault(parser, AutoDetectParser::new);
this.contentHandler = getOrDefault(contentHandler, () -> new BodyContentHandler(NO_WRITE_LIMIT));
this.metadata = getOrDefault(metadata, Metadata::new);
this.parseContext = getOrDefault(parseContext, ParseContext::new);
// Adapts the given component instances to the supplier-based constructor. Each lambda captures its
// argument; getOrDefault presumably returns that argument when it is non-null and otherwise asks the
// default supplier for a value (verify against Utils.getOrDefault). A non-null argument is therefore
// handed out again on every get(), i.e. shared across parse calls — presumably the reason the
// deprecation note points multi-file users to the supplier-based constructor.
this(
() -> getOrDefault(parser, DEFAULT_PARSER_SUPPLIER),
() -> getOrDefault(contentHandler, DEFAULT_CONTENT_HANDLER_SUPPLIER),
() -> getOrDefault(metadata, DEFAULT_METADATA_SUPPLIER),
() -> getOrDefault(parseContext, DEFAULT_PARSE_CONTEXT_SUPPLIER)
);
}

/**
 * Builds an {@code ApacheTikaDocumentParser} that obtains its Tika components from the given suppliers.
 * Any supplier passed as {@code null} is replaced by the corresponding default supplier.
 *
 * @param parserSupplier         supplies the Tika parser; when {@code null}, {@link AutoDetectParser} is used
 * @param contentHandlerSupplier supplies the Tika content handler; when {@code null},
 *                               a {@link BodyContentHandler} without write limit is used
 * @param metadataSupplier       supplies the Tika metadata; when {@code null}, an empty {@link Metadata} is used
 * @param parseContextSupplier   supplies the Tika parse context; when {@code null},
 *                               an empty {@link ParseContext} is used
 */
public ApacheTikaDocumentParser(Supplier<Parser> parserSupplier,
                                Supplier<ContentHandler> contentHandlerSupplier,
                                Supplier<Metadata> metadataSupplier,
                                Supplier<ParseContext> parseContextSupplier) {
    // Keep the caller's supplier when one was given; otherwise fall back to the shared default.
    this.parserSupplier = parserSupplier != null
            ? parserSupplier
            : DEFAULT_PARSER_SUPPLIER;
    this.contentHandlerSupplier = contentHandlerSupplier != null
            ? contentHandlerSupplier
            : DEFAULT_CONTENT_HANDLER_SUPPLIER;
    this.metadataSupplier = metadataSupplier != null
            ? metadataSupplier
            : DEFAULT_METADATA_SUPPLIER;
    this.parseContextSupplier = parseContextSupplier != null
            ? parseContextSupplier
            : DEFAULT_PARSE_CONTEXT_SUPPLIER;
}

// TODO allow automatic extraction of metadata (e.g. creator, last author, created/modified timestamps, etc.)

@Override
public Document parse(InputStream inputStream) {
try {
Parser parser = parserSupplier.get();
ContentHandler contentHandler = contentHandlerSupplier.get();
Metadata metadata = metadataSupplier.get();
ParseContext parseContext = parseContextSupplier.get();

parser.parse(inputStream, contentHandler, metadata, parseContext);
String text = contentHandler.toString();

Expand Down
@@ -1,9 +1,10 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.parser.AutoDetectParser;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

Expand Down Expand Up @@ -40,7 +41,7 @@ void should_parse_doc_ppt_and_pdf_files(String fileName) {
})
void should_parse_xls_files(String fileName) {

DocumentParser parser = new ApacheTikaDocumentParser(new AutoDetectParser(), null, null, null);
DocumentParser parser = new ApacheTikaDocumentParser(AutoDetectParser::new, null, null, null);
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);

Document document = parser.parse(inputStream);
Expand All @@ -50,6 +51,24 @@ void should_parse_xls_files(String fileName) {
assertThat(document.metadata().asMap()).isEmpty();
}

@Test
void should_parse_files_stateless() {
    // One parser instance is reused for two independent streams of the same spreadsheet;
    // both parses must yield the same text and empty metadata.
    String expectedText = "Sheet1\ntest content\nSheet2\ntest content";
    ClassLoader resources = getClass().getClassLoader();

    DocumentParser sharedParser = new ApacheTikaDocumentParser();
    InputStream firstStream = resources.getResourceAsStream("test-file.xls");
    InputStream secondStream = resources.getResourceAsStream("test-file.xls");

    Document firstDocument = sharedParser.parse(firstStream);
    Document secondDocument = sharedParser.parse(secondStream);

    assertThat(firstDocument.text()).isEqualToIgnoringWhitespace(expectedText);
    assertThat(secondDocument.text()).isEqualToIgnoringWhitespace(expectedText);
    assertThat(firstDocument.metadata().asMap()).isEmpty();
    assertThat(secondDocument.metadata().asMap()).isEmpty();
}

@ParameterizedTest
@ValueSource(strings = {
"empty-file.txt",
Expand Down

0 comments on commit f34c543

Please sign in to comment.