fix: parse URL response Content-Type according to RFC 9110 (#2950)

Unstructured-IO · Apr 30, 2024 · 0d80886 · 0d80886
1 parent 7720e72
commit 0d80886
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.13.6
+
+### Enhancements
+
+### Features
+
+### Fixes
+
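+- **Fix `ValueError: Invalid file (FileType.UNK)` when a URL response's Content-Type header includes a charset directive** URL response Content-Type headers are now parsed according to RFC 9110: parameters such as `; charset=utf-8` are dropped and the media type is lower-cased before file-type detection.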
+
 ## 0.13.5
 
 ### Enhancements

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -609,6 +609,22 @@ def test_auto_partition_from_url():
     assert elements[0].metadata.url == url
 
 
+def test_auto_partition_from_url_with_rfc9110_content_type():
+    """Partition a URL when `content_type` carries a `; charset=` parameter."""
+    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
+    mime = "text/plain; charset=utf-8"
+    first = partition(url=url, content_type=mime, strategy=PartitionStrategy.HI_RES)[0]
+    assert first == Title("Apache License")
+    assert first.metadata.url == url
+
+
+def test_auto_partition_from_url_without_providing_content_type():
+    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
+    first = partition(url=url, strategy=PartitionStrategy.HI_RES)[0]  # no content_type given
+    assert first == Title("Apache License")
+    assert first.metadata.url == url
+
+
 def test_partition_md_works_with_embedded_html():
     url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
     elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.5"  # pragma: no cover
+__version__ = "0.13.6"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -564,7 +564,9 @@ def file_and_type_from_url(
     response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout)
     file = io.BytesIO(response.content)
 
-    content_type = content_type or response.headers.get("Content-Type")
+    content_type = (
+        content_type or response.headers.get("Content-Type", "").split(";")[0].strip().lower()
+    )
     encoding = response.headers.get("Content-Encoding", "utf-8")
 
     filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding)