1- <#
2- . SYNOPSIS
3- Takes an HTML input and converts it to an HTMLAgilityPack htmlNode object that can be navigated using Linq
4- . DESCRIPTION
5- Long description
6- . EXAMPLE
7- PS C:\> $HTMLString = @"
8- <!DOCTYPE html>
9- <html>
10- <body>
11- <h1>My First Heading</h1>
12- <p>My first paragraph.</p>d
13- </body>
14- </html>
15- "@
16- PS C:\> $HTMLString | ConvertFrom-HTML -OutVariable result
17-
18- NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
19- -------- ---- -------------- -------------- ------------- ---------
20- Document #document 0 4 103 …
21-
22- PS C:\> $result.SelectSingleNode("//body/h1")
23-
24- NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
25- -------- ---- -------------- -------------- ------------- ---------
26- Element h1 0 1 16 My First Heading
27-
28- Convert HTML string to a HtmlNode via the pipeline.
29-
30- . EXAMPLE
31- PS C:\> $uri = "https://www.powershellgallery.com/"
32- PS C:\> $result = ConvertFrom-HTML -uri $uri
33- PS C:\> $result
34-
35- NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
36- -------- ---- -------------- -------------- ------------- ---------
37- Document #document 0 4 17550 …
38-
39- Fetch and parse $uri directly via the URI pipeline.
40- . EXAMPLE
41- PS C:\> Get-Item $testFilePath | ConvertFrom-Html
42-
43- NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
44- -------- ---- -------------- -------------- ------------- ---------
45- Document #document 0 5 105 …
46-
47- Parse an HTML file piped from Get-Item.
48- . INPUTS
49- [String[]]
50- [System.IO.FileInfo[]]
51- . OUTPUTS
52- [HtmlAgilityPack.HtmlDocument]
53- [HtmlAgilityPack.HtmlNode]
54- . NOTES
55- General notes
56- #>
1+
function ConvertFrom-Html {
    <#
    .SYNOPSIS
        Takes an HTML input and converts it to an HTMLAgilityPack htmlNode object that can be navigated using Linq
    .DESCRIPTION
        Parses HTML supplied as text (-Content), fetched from a web address (-URI), or read
        from a file (-Path) using HtmlAgilityPack. One result is written to the pipeline per
        input item: the document's root HtmlNode by default, or the HtmlDocument itself when
        -Raw is specified. All three inputs accept arrays and pipeline input.
    .EXAMPLE
        $HTMLString = @'
        <!DOCTYPE html>
        <html>
        <body>
        <h1>My First Heading</h1>
        <p>My first paragraph.</p>d
        </body>
        </html>
        '@ | ConvertFrom-HTML

        $HTMLString

        NodeType Name      AttributeCount ChildNodeCount ContentLength InnerText
        -------- ----      -------------- -------------- ------------- ---------
        Document #document 0              4              103           …

        $HTMLString.SelectSingleNode('//body/h1')

        NodeType Name AttributeCount ChildNodeCount ContentLength InnerText
        -------- ---- -------------- -------------- ------------- ---------
        Element  h1   0              1              16            My First Heading

        Convert HTML string to a HtmlNode via the pipeline.

    .EXAMPLE
        $uri = [Uri]'https://www.powershellgallery.com/' | ConvertFrom-HTML
        $uri

        NodeType Name      AttributeCount ChildNodeCount ContentLength InnerText
        -------- ----      -------------- -------------- ------------- ---------
        Document #document 0              4              17550         …

        Fetch and parse a url.
    .EXAMPLE
        Get-Item $testFilePath | ConvertFrom-Html

        NodeType Name      AttributeCount ChildNodeCount ContentLength InnerText
        -------- ----      -------------- -------------- ------------- ---------
        Document #document 0              5              105           …

        Parse an HTML file piped from Get-Item.
    .INPUTS
        [String[]]
        [System.IO.FileInfo[]]
        [System.URI[]]
    .OUTPUTS
        [HtmlAgilityPack.HtmlDocument]
        [HtmlAgilityPack.HtmlNode]
    .NOTES
        Every input item is parsed into its own HtmlDocument instance, so the objects
        returned for multi-item input (array or pipeline) are independent of each other.
    #>
    [OutputType([HtmlAgilityPack.HtmlNode])]
    [OutputType([HtmlAgilityPack.HtmlDocument])]
    [CmdletBinding(DefaultParameterSetName = 'String')]
    param (
        # The HTML text to parse. Accepts multiple separate documents as an array. This also accepts pipeline from Invoke-WebRequest
        [Parameter(ParameterSetName = 'String', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [String[]] $Content,

        # The URI or URIs from which to retrieve content. This may be faster than using Invoke-WebRequest but is less flexible in the method of retrieval (for instance, no POST)
        [Parameter(ParameterSetName = 'URI', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [System.URI[]] $URI,

        # Path to file or files containing HTML content to convert. This accepts pipeline from Get-Childitem or Get-Item
        [Parameter(ParameterSetName = 'Path', Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, Position = 0)]
        [System.IO.FileInfo[]] $Path,

        # Do not return the Linq documentnode, instead return the HTMLDocument object. This is useful if you want to do XPath queries instead of Linq queries
        [switch] $Raw
    )

    begin {
        # One HtmlWeb is shared by all URI inputs; the document it loads is taken
        # from the return value of each Load() call rather than from shared state.
        $web = [HtmlAgilityPack.HtmlWeb]::new()
    }

    process {
        # The bound parameter set tells us which input kind this pipeline item is.
        # Named loop variables are used instead of $_ because `switch` also sets $_.
        switch ($PSCmdlet.ParameterSetName) {
            'String' {
                foreach ($markup in $Content) {
                    Write-Verbose 'Loading HTML'
                    # A fresh document per item: reusing one instance would make every
                    # -Raw result the same object, showing only the last input parsed.
                    $document = [HtmlAgilityPack.HtmlDocument]::new()
                    $document.LoadHtml($markup)
                    if ($Raw) { $document } else { $document.DocumentNode }
                }
            }
            'URI' {
                foreach ($address in $URI) {
                    Write-Verbose "Loading URI $address"
                    $document = $web.Load($address)
                    if ($Raw) { $document } else { $document.DocumentNode }
                }
            }
            'Path' {
                foreach ($file in $Path) {
                    Write-Verbose "Loading File $file"
                    # Fresh document per file, for the same reason as the String case.
                    $document = [HtmlAgilityPack.HtmlDocument]::new()
                    $document.Load($file.FullName)
                    if ($Raw) { $document } else { $document.DocumentNode }
                }
            }
        }
    }
}
0 commit comments