Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add urlscan as a passive source #859

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
29 changes: 29 additions & 0 deletions cmd/katana/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,14 @@ import (
"github.com/projectdiscovery/goflags"
"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/katana/internal/runner"
"github.com/projectdiscovery/katana/pkg/engine/passive"
"github.com/projectdiscovery/katana/pkg/output"
"github.com/projectdiscovery/katana/pkg/types"
errorutil "github.com/projectdiscovery/utils/errors"
fileutil "github.com/projectdiscovery/utils/file"
folderutil "github.com/projectdiscovery/utils/folder"
"github.com/rs/xid"
"gopkg.in/yaml.v2"
)

var (
Expand Down Expand Up @@ -107,6 +110,7 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.BoolVarP(&options.IgnoreQueryParams, "ignore-query-params", "iqp", false, "Ignore crawling same path with different query-param values"),
flagSet.BoolVarP(&options.TlsImpersonate, "tls-impersonate", "tlsi", false, "enable experimental client hello (ja3) tls randomization"),
flagSet.BoolVarP(&options.DisableRedirects, "disable-redirects", "dr", false, "disable following redirects (default false)"),
flagSet.StringVarP(&options.PassiveProviderConfig, "passive-provider-config", "ppc", folderutil.AppConfigDirOrDefault("", "katana/passive-providers-config.yaml"), "provider config file"),
)

flagSet.CreateGroup("debug", "Debug",
Expand Down Expand Up @@ -182,6 +186,12 @@ pipelines offering both headless and non-headless crawling.`)
return nil, errorutil.NewWithErr(err).Msgf("could not parse flags")
}

if exists := fileutil.FileExists(options.PassiveProviderConfig); !exists {
if err := createPassiveProviderConfigYAML(options.PassiveProviderConfig); err != nil {
gologger.Error().Msgf("Could not create provider config file: %s\n", err)
}
}

if cfgFile != "" {
if err := flagSet.MergeConfigFile(cfgFile); err != nil {
return nil, errorutil.NewWithErr(err).Msgf("could not read config file")
Expand All @@ -192,6 +202,25 @@ pipelines offering both headless and non-headless crawling.`)
return flagSet, nil
}

// createPassiveProviderConfigYAML writes a template passive-provider config
// file to configFilePath.
//
// The template is a YAML map with one entry per registered passive source
// whose NeedsKey method reports true, keyed by the lower-cased source name and
// holding an empty list of API keys for the user to fill in.
//
// It returns the error from creating, encoding to, or closing the file. The
// close error is reported because a failed close on a freshly written file
// can mean the content never reached the disk.
func createPassiveProviderConfigYAML(configFilePath string) (err error) {
	configFile, err := os.Create(configFilePath)
	if err != nil {
		return err
	}
	defer func() {
		// Keep the first error; only surface the close error when nothing
		// else went wrong.
		if closeErr := configFile.Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}()

	sourcesRequiringApiKeysMap := make(map[string][]string)
	for _, source := range passive.Sources {
		if source.NeedsKey() {
			sourceName := strings.ToLower(source.Name())
			sourcesRequiringApiKeysMap[sourceName] = []string{}
		}
	}

	return yaml.NewEncoder(configFile).Encode(sourcesRequiringApiKeysMap)
}

func init() {
// show detailed stacktrace in debug mode
if os.Getenv("DEBUG") == "true" {
Expand Down
33 changes: 33 additions & 0 deletions pkg/engine/passive/passive.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ package passive

import (
"context"
"errors"
"fmt"
"net/http"
"os"
"strings"
"sync"
"time"
Expand All @@ -16,8 +18,10 @@ import (
"github.com/projectdiscovery/katana/pkg/types"
"github.com/projectdiscovery/katana/pkg/utils"
errorutil "github.com/projectdiscovery/utils/errors"
fileutil "github.com/projectdiscovery/utils/file"
urlutil "github.com/projectdiscovery/utils/url"
"golang.org/x/exp/maps"
"gopkg.in/yaml.v2"
)

// Crawler is a passive crawler instance
Expand All @@ -34,6 +38,15 @@ func New(options *types.CrawlerOptions) (*Crawler, error) {
return nil, errorutil.NewWithErr(err).WithTag("passive")
}

// Load the passive providers info from the file
if options.Options.Passive && fileutil.FileExists(options.Options.PassiveProviderConfig) {
gologger.Info().Msgf("Loading provider config from %s", options.Options.PassiveProviderConfig)

if err := loadPassiveProvidersFrom(options.Options.PassiveProviderConfig); err != nil && (!strings.Contains(err.Error(), "file doesn't exist") || errors.Is(os.ErrNotExist, err)) {
gologger.Error().Msgf("Could not read providers from %s: %s\n", options.Options.PassiveProviderConfig, err)
}
}

sources := make(map[string]source.Source, len(Sources))
if len(options.Options.PassiveSource) > 0 {
for _, source := range options.Options.PassiveSource {
Expand Down Expand Up @@ -143,3 +156,23 @@ func (c *Crawler) Crawl(rootURL string) error {
gologger.Info().Msgf("Found %d endpoints for %s in %s (%s)", len(seenURLs), rootURL, timeTaken.String(), strings.Join(stats, ", "))
return nil
}

// loadPassiveProvidersFrom reads the provider config at file (through
// fileutil.SubstituteConfigFromEnvVars) and decodes it as a YAML map from
// source name to a list of API keys. Every registered source that needs a key
// and has a non-empty list under its lower-cased name receives that list via
// AddApiKeys.
//
// The decode error, if any, is returned only after the sources have been
// visited, so whatever ended up in the map is still applied.
func loadPassiveProvidersFrom(file string) error {
	configReader, err := fileutil.SubstituteConfigFromEnvVars(file)
	if err != nil {
		return err
	}

	keysBySource := make(map[string][]string)
	decodeErr := yaml.NewDecoder(configReader).Decode(keysBySource)

	for _, provider := range Sources {
		name := strings.ToLower(provider.Name())
		keys := keysBySource[name]
		if !provider.NeedsKey() || len(keys) == 0 {
			continue
		}
		gologger.Debug().Msgf("API key(s) found for %s.", name)
		provider.AddApiKeys(keys)
	}

	return decodeErr
}
2 changes: 2 additions & 0 deletions pkg/engine/passive/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ import (
"github.com/projectdiscovery/katana/pkg/engine/passive/source"
"github.com/projectdiscovery/katana/pkg/engine/passive/source/alienvault"
"github.com/projectdiscovery/katana/pkg/engine/passive/source/commoncrawl"
"github.com/projectdiscovery/katana/pkg/engine/passive/source/urlscan"
"github.com/projectdiscovery/katana/pkg/engine/passive/source/waybackarchive"
)

// Sources is the registry of available passive sources, keyed by source name.
var Sources = map[string]source.Source{
"waybackarchive": &waybackarchive.Source{},
"commoncrawl": &commoncrawl.Source{},
"alienvault": &alienvault.Source{},
"urlscan": &urlscan.Source{},
}
109 changes: 109 additions & 0 deletions pkg/engine/passive/source/urlscan/urlscan.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package urlscan

import (
"context"
"fmt"
"net/http"
"strconv"

jsoniter "github.com/json-iterator/go"
"github.com/projectdiscovery/katana/pkg/engine/common"
"github.com/projectdiscovery/katana/pkg/engine/passive/httpclient"
"github.com/projectdiscovery/katana/pkg/engine/passive/source"
urlutil "github.com/projectdiscovery/utils/url"
)

// response is the part of a search API reply that Run decodes: the list of
// results and the has_more flag that Run uses to decide whether to request
// another page.
type response struct {
Results []Result `json:"results"`
HasMore bool `json:"has_more"`
}

// Result is a single entry of the "results" array: the page it refers to and
// its "sort" tuple. Run reads the first two elements of Sort of the last
// result (as a number and a string) to build the search_after parameter of the
// next request.
type Result struct {
Page Page `json:"page"`
Sort []interface{} `json:"sort"`
}

// Page holds the "url" field of a result's "page" object; Run emits this value
// for every result.
type Page struct {
Url string `json:"url"`
}

// Source is the urlscan passive source. apiKeys holds the keys installed
// through AddApiKeys; Run picks one of them for its requests.
type Source struct {
apiKeys []string
}

// Run queries the urlscan.io search API for the host of rootUrl and streams
// the page URL of every result on the returned channel. The channel is closed
// when the search is exhausted or an error has been reported.
//
// If rootUrl parses as a URL only its hostname is used in the query; otherwise
// it is used as given. One of the configured API keys is picked at random and
// sent in the "API-Key" header; when no key is configured the channel is
// closed without any result.
//
// Pages are followed while the reply sets has_more, using the sort tuple of
// the last result as the search_after cursor. Request errors, decode errors
// and HTTP 429 replies are reported as a Result with Error set and end the run.
func (s *Source) Run(ctx context.Context, sharedCtx *common.Shared, rootUrl string) <-chan source.Result {
	results := make(chan source.Result)

	go func() {
		defer close(results)

		if parsedRootUrl, err := urlutil.Parse(rootUrl); err == nil {
			rootUrl = parsedRootUrl.Hostname()
		}
		httpClient := httpclient.NewHttpClient(sharedCtx.Options.Options.Timeout)

		randomApiKey := source.PickRandom(s.apiKeys, s.Name())
		if randomApiKey == "" {
			return
		}

		headers := map[string]string{"API-Key": randomApiKey}
		// baseURL is never modified, so each page request carries exactly one
		// search_after parameter instead of accumulating one per page.
		baseURL := fmt.Sprintf("https://urlscan.io/api/v1/search/?q=domain:%s&size=10000", rootUrl)

		var searchAfter string
		for hasMore := true; hasMore; {
			apiURL := baseURL
			if searchAfter != "" {
				apiURL = fmt.Sprintf("%s&search_after=%s", baseURL, searchAfter)
			}

			resp, err := httpClient.Get(ctx, apiURL, "", headers)
			if err != nil {
				results <- source.Result{Source: s.Name(), Error: err}
				httpClient.DiscardHTTPResponse(resp)
				return
			}

			// Check for rate limiting before decoding: a 429 body is not
			// expected to match the search reply schema.
			if resp.StatusCode == http.StatusTooManyRequests {
				resp.Body.Close()
				results <- source.Result{Source: s.Name(), Error: fmt.Errorf("urlscan rate limited")}
				return
			}

			var data response
			err = jsoniter.NewDecoder(resp.Body).Decode(&data)
			resp.Body.Close()
			if err != nil {
				results <- source.Result{Source: s.Name(), Error: err}
				return
			}

			for _, entry := range data.Results {
				results <- source.Result{Source: s.Name(), Value: entry.Page.Url, Reference: apiURL}
			}

			hasMore = data.HasMore
			if !hasMore {
				break
			}

			// Without a fresh cursor the next request would be identical to
			// this one and the loop would never terminate.
			if len(data.Results) == 0 {
				return
			}
			nextCursor, ok := searchAfterCursor(data.Results[len(data.Results)-1].Sort)
			if !ok || nextCursor == searchAfter {
				return
			}
			searchAfter = nextCursor
		}
	}()

	return results
}

// searchAfterCursor builds the search_after value "<number>,<string>" from the
// first two elements of a result's sort tuple. It reports false when the tuple
// has fewer than two elements or they do not have the expected types, instead
// of panicking on a bad index or type assertion.
func searchAfterCursor(sort []interface{}) (string, bool) {
	if len(sort) < 2 {
		return "", false
	}
	first, ok := sort[0].(float64)
	if !ok {
		return "", false
	}
	second, ok := sort[1].(string)
	if !ok {
		return "", false
	}
	// int64 keeps large values intact on platforms where int is 32 bits.
	return strconv.FormatInt(int64(first), 10) + "," + second, true
}

// Name returns the identifier of this source, "urlscan".
func (s *Source) Name() string {
return "urlscan"
}

// NeedsKey reports whether this source requires an API key; it always returns
// true.
func (s *Source) NeedsKey() bool {
return true
}

// AddApiKeys sets the API keys of the source to keys, replacing whatever was
// stored before. The slice is stored as given, not copied.
func (s *Source) AddApiKeys(keys []string) {
s.apiKeys = keys
}
42 changes: 42 additions & 0 deletions pkg/engine/passive/source/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package source

import (
"math/rand"
"strings"

"github.com/projectdiscovery/gologger"
)

// MultipleKeyPartsLength is the number of ":"-separated parts a key must have
// for createMultiPartKey to accept it as a multi-part key.
const MultipleKeyPartsLength = 2

// PickRandom returns a randomly chosen element of v. When v is empty it logs a
// debug message naming sourceName and returns the zero value of T.
func PickRandom[T any](v []T, sourceName string) T {
	if count := len(v); count > 0 {
		return v[rand.Intn(count)]
	}

	gologger.Debug().Msgf("Cannot use the %s source because there was no API key/secret defined for it.", sourceName)
	var zero T
	return zero
}

// CreateApiKeys converts every entry of keys that createMultiPartKey accepts
// into a T by passing its two parts to provider. Entries that are not valid
// multi-part keys are skipped. The result is nil when nothing was converted.
func CreateApiKeys[T any](keys []string, provider func(k, v string) T) []T {
	var apiKeys []T
	for i := range keys {
		first, second, valid := createMultiPartKey(keys[i])
		if !valid {
			continue
		}
		apiKeys = append(apiKeys, provider(first, second))
	}
	return apiKeys
}

// createMultiPartKey splits key on ":" and, when that yields exactly
// MultipleKeyPartsLength parts, returns the first two parts with ok set to
// true. For any other number of parts it returns empty strings and false.
func createMultiPartKey(key string) (keyPartA, keyPartB string, ok bool) {
	segments := strings.Split(key, ":")
	if len(segments) != MultipleKeyPartsLength {
		return "", "", false
	}
	return segments[0], segments[1], true
}
2 changes: 2 additions & 0 deletions pkg/types/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ type Options struct {
TlsImpersonate bool
//DisableRedirects disables the following of redirects
DisableRedirects bool
// PassiveProviderConfig is the path to the passive provider configuration file
PassiveProviderConfig string
}

func (options *Options) ParseCustomHeaders() map[string]string {
Expand Down