Skip to content

Commit

Permalink
[BUG] Introduce parser supplier support in FileSystemDocumentLoader#loadDocuments langchain4j#1026
Browse files Browse the repository at this point in the history
  • Loading branch information
KaisNeffati committed May 3, 2024
1 parent a85a0d5 commit fe1181d
Showing 1 changed file with 15 additions and 10 deletions.
@@ -1,7 +1,7 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.metadata.Metadata;
Expand Down Expand Up @@ -60,7 +60,12 @@ public ApacheTikaDocumentParser(Parser parser,
ContentHandler contentHandler,
Metadata metadata,
ParseContext parseContext) {
this(() -> parser, () -> contentHandler, () -> metadata, () -> parseContext);
this(
() -> getOrDefault(parser, DEFAULT_PARSER_SUPPLIER),
() -> getOrDefault(contentHandler, DEFAULT_CONTENT_HANDLER_SUPPLIER),
() -> getOrDefault(metadata, DEFAULT_METADATA_SUPPLIER),
() -> getOrDefault(parseContext, DEFAULT_PARSE_CONTEXT_SUPPLIER)
);
}

/**
Expand All @@ -76,21 +81,21 @@ public ApacheTikaDocumentParser(Supplier<Parser> parserSupplier,
Supplier<ContentHandler> contentHandlerSupplier,
Supplier<Metadata> metadataSupplier,
Supplier<ParseContext> parseContextSupplier) {
this.parserSupplier = parserSupplier;
this.contentHandlerSupplier = contentHandlerSupplier;
this.metadataSupplier = metadataSupplier;
this.parseContextSupplier = parseContextSupplier;
this.parserSupplier = getOrDefault(parserSupplier, () -> DEFAULT_PARSER_SUPPLIER);
this.contentHandlerSupplier = getOrDefault(contentHandlerSupplier, () -> DEFAULT_CONTENT_HANDLER_SUPPLIER);
this.metadataSupplier = getOrDefault(metadataSupplier, () -> DEFAULT_METADATA_SUPPLIER);
this.parseContextSupplier = getOrDefault(parseContextSupplier, () -> DEFAULT_PARSE_CONTEXT_SUPPLIER);
}

// TODO: allow automatic extraction of metadata (e.g. creator, last-author, created/modified timestamps, etc.)

@Override
public Document parse(InputStream inputStream) {
try {
Parser parser = getOrDefault(parserSupplier, () -> DEFAULT_PARSER_SUPPLIER).get();
ContentHandler contentHandler = getOrDefault(contentHandlerSupplier, () -> DEFAULT_CONTENT_HANDLER_SUPPLIER).get();
Metadata metadata = getOrDefault(metadataSupplier, () -> DEFAULT_METADATA_SUPPLIER).get();
ParseContext parseContext = getOrDefault(parseContextSupplier, () -> DEFAULT_PARSE_CONTEXT_SUPPLIER).get();
Parser parser = parserSupplier.get();
ContentHandler contentHandler = contentHandlerSupplier.get();
Metadata metadata = metadataSupplier.get();
ParseContext parseContext = parseContextSupplier.get();

parser.parse(inputStream, contentHandler, metadata, parseContext);
String text = contentHandler.toString();
Expand Down

0 comments on commit fe1181d

Please sign in to comment.