Skip to content

Descendants and CssSelect do not take all elements correctly #1130

@RemSoftDev

Description

@RemSoftDev

https://www.amazon.com/s/ref=lp_1_nr_n_0?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1%2Cn%3A173508&bbn=1&ie=UTF8&qid=1519043803&rnid=1

Try to select all "ul" elements. It always finds a random number of "ul" elements, somewhere between 12 and 19, but the page always contains exactly 38 ULs.

Or simply run this code:

open System.Net
open System
open System.IO
open FSharp.Data
/// Requests `url` over HTTP with a desktop-browser User-Agent header and
/// passes the open response reader, together with `url`, to `callback`.
/// Returns whatever `callback` returns; the response, stream and reader are
/// disposed once `callback` has finished.
let fetchUrl callback url =
    let browserUserAgent =
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2"
    let request = WebRequest.Create(Uri(url)) :?> HttpWebRequest
    request.UserAgent <- browserUserAgent

    // The `use` bindings keep the reader alive for the duration of the callback.
    use response = request.GetResponse()
    use body = response.GetResponseStream()
    use bodyReader = new IO.StreamReader(body)
    callback bodyReader url

/// Applies `pUrlBuilder` to `pUrlPart` to obtain the full URL string, then
/// passes that URL to `pReauest` and returns its result.
let GetStreamUrl pReauest pUrlBuilder pUrlPart =
    (pUrlBuilder pUrlPart : string) |> pReauest
    

/// Reads the entire response body from `reader`, prints `url` together with a
/// preview of at most the first 1000 characters, and returns the full HTML.
let myCallback (reader:IO.StreamReader) url = 
    let html = reader.ReadToEnd()
    // Substring(0, 1000) throws ArgumentOutOfRangeException when the page is
    // shorter than 1000 characters, so clamp the preview to the actual length.
    let html1000 = html.Substring(0, min 1000 html.Length)
    printfn "Downloaded %s. First 1000 is %s" url html1000
    html      // return all the html

/// Fetches the page text for `pUrlPart` through `pGetStreamUrl`, parses it as
/// an HTML document and queries its "ul" elements twice: once with
/// `Descendants` and once with `CssSelect`. Both results are bound to locals
/// only (handy for inspecting in a debugger); `name` and `listT` are unused
/// and the function always returns the empty string.
let GetLinkNameUrl pGetStreamUrl pUrlPart name listT = 
    let pageHtml : string = pGetStreamUrl pUrlPart
    let document = HtmlDocument.Parse pageHtml
    // Same tag queried through two different APIs so the results can be compared.
    let ulByDescendants = document.Descendants "ul"
    let ulByCssSelect = document.CssSelect "ul"
    ""

/// Concatenates the two strings `l` and `r`, in that order, with no separator.
let FullUrl l r = 
    [ l; r ] |> String.concat ""
/// Program entry point: wires the download pipeline together, runs
/// `GetLinkNameUrl` against the Amazon search page from the issue, prints the
/// command-line arguments and exits with code 0.
[<EntryPoint>]
let main argv = 
    let baseUrl = "https://www.amazon.com"
    let searchPath =
        "/s/ref=lp_1_nr_n_0?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1%2Cn%3A173508&bbn=1&ie=UTF8&qid=1519043803&rnid=1"

    // Relative path -> full URL -> downloaded HTML text.
    let downloadPage = GetStreamUrl (fetchUrl myCallback) (FullUrl baseUrl)
    let url = GetLinkNameUrl downloadPage searchPath "123" List.empty

    printfn "%A" argv
    0 // return an integer exit code

// Excerpt of the two queries made inside GetLinkNameUrl; `doc` is the
// HtmlDocument parsed there from the downloaded page text.
let desc = doc.Descendants "ul"
let css = doc.CssSelect "ul"

should each contain 38 elements.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions