Blog 0x0A: Windows PowerShell (PoSh) to parse some HTML content.

Friday, February 05, 2010

Windows PowerShell (PoSh) to parse some HTML content.

#wps_lidimedia_v30.ps1
#Windows PowerShell script to upgrade LidiMedia content (resources to be displayed in a moodle platform).
#NOV-2009

# Collect every file below the current directory whose name matches *.htm*
# (.htm, .html, ...). Directories are filtered out: a folder whose name happens
# to match the filter would otherwise be handed to Get-Content further down.
$files = Get-ChildItem -Path . -Recurse -Filter *.htm* | Where-Object { -not $_.PSIsContainer }

# 1) Parse and fix links and URLs in the href and name attributes (replace accents).
# System.Web is loaded for [System.Web.HttpUtility], which is only referenced
# from commented-out lines in this listing.
Add-Type -AssemblyName System.Web

# Encoding object used by decody for its GetBytes/GetString round trip.
# NOTE(review): presumably chosen because the round trip through this code
# page maps accented Latin letters to their plain form - confirm on the
# target runtime.
$i8 = [Text.Encoding]::GetEncoding("iso-8859-8")

# Passes every tag fragment that carries a name=... or href=... attribute
# through the $i8 GetBytes/GetString round trip and saves the document.
#   $before - path of the HTML file to read.
#   $after  - path of the file to write (may be the same path as $before).
# Uses the script-level $i8 encoding object.
function decody ($before, $after) {
    # NOTE(review): the published listing lost every "<...>" run of these two
    # patterns (the leading "<[^>", the group names and the back-reference),
    # which left them invalid. They are rebuilt here: the group name "k" is
    # dictated by the surviving "(?(k)" conditional; the name "v" and the exact
    # tag prefix are a reconstruction - confirm against the original script.
    #
    #   (?<k>[`"'])?        optional opening quote, remembered as "k"
    #   (?<v>.*?)           the attribute value (lazy)
    #   (?(k)\k<k>|[>\s\/]) same quote again if one was opened, otherwise the
    #                       value ends at '>', whitespace or '/'
    $name = new-object regex "<[^>]+?name\s*=\s*(?<k>[`"'])?(?<v>.*?)(?(k)\k<k>|[>\s\/])","Compiled,Multiline,IgnoreCase"
    $href = new-object regex "<[^>]+?href\s*=\s*(?<k>[`"'])?(?<v>.*?)(?(k)\k<k>|[>\s\/])","Compiled,Multiline,IgnoreCase"

    # Match evaluator: re-encodes the whole matched text (not only the value).
    # Earlier variant, kept for reference, HTML-decoded the match first:
    #$ody = { $args | %{$i8.GetString($i8.GetBytes( [System.Web.HttpUtility]::HtmlDecode($_.Value) ))}}
    $ody = { $args | %{$i8.GetString($i8.GetBytes( $_.Value ))}}

    # The "%%%" delimiter is used so the file is read as one chunk instead of
    # line by line (assuming "%%%" never occurs in the content).
    # name attributes are rewritten first, then href attributes.
    sc $after ( $href.Replace( $name.Replace( (gc $before -delim "%%%"), $ody ), $ody ) )
}

# Run decody in place on every collected file: the same path is given as
# both the source and the destination.
$files | ForEach-Object {
    $path = $_.FullName
    decody $path $path
}

# 2) Replace accents with HTML entities in the rest of the document.
# Replacement table: each entry is @(pattern, replacement).
# Element 0 is the literal character to look for (matched case-sensitively by
# the loop that consumes this table), element 1 is the HTML entity written in
# its place.
# NOTE(review): the published listing showed the entity column already decoded
# (each character "replaced" by itself, and an unterminated """ for the two
# curly quotes). The entities below are restored from the step description
# ("replace accents for html entities") - confirm against the original script.
# The script file must be saved in an encoding that preserves the accented
# literals in the first column.
$replacements = @{
    the_a = @([regex] "á", "&aacute;");
    the_e = @([regex] "é", "&eacute;");
    the_i = @([regex] "í", "&iacute;");
    the_o = @([regex] "ó", "&oacute;");
    the_u = @([regex] "ú", "&uacute;");
    the_n = @([regex] "ñ", "&ntilde;");
    the_Am = @([regex] "Á", "&Aacute;");
    the_Em = @([regex] "É", "&Eacute;");
    the_Im = @([regex] "Í", "&Iacute;");
    the_Om = @([regex] "Ó", "&Oacute;");
    the_Um = @([regex] "Ú", "&Uacute;");
    the_Nm = @([regex] "Ñ", "&Ntilde;");
    # Typographic (curly) double quotes both become the plain quote entity.
    the_DoubleAposNegative = @([regex] '“', "&quot;");
    the_DoubleAposPositive = @([regex] '”', "&quot;")
}

# Apply every entry of $replacements to every collected file.
# All substitutions are done in memory and the file is written once, instead
# of being rewritten and re-read after each individual replacement.
foreach ($file in $files)
{
    $content = Get-Content $file.FullName

    # Get-Content yields nothing for an empty file; skip it so that it is not
    # rewritten as a file containing a single blank line.
    if ($null -eq $content) { continue }

    foreach ($replacement in $replacements.keys)
    {
        $pair = $replacements[$replacement]
        # -creplace: case-sensitive, so upper- and lower-case accents keep
        # their own entity.
        $content = $content -creplace $pair[0].ToString(), $pair[1]
    }

    Set-Content $file.FullName $content
}

# 3) Lowercase links and URLs in the href and src attributes.
# Patterns for href/src attributes with a double- or single-quoted value.
# Group 1 is the target up to (not including) an optional "#fragment";
# group 2 is the fragment, which is left untouched.
$re_double = New-Object regex '(?:href|src)\s*=\s*"(.*?)(#.*?)?"', "Compiled,Multiline,IgnoreCase"
$re_single = New-Object regex "(?:href|src)\s*=\s*'(.*?)(#.*?)?'", "Compiled,Multiline,IgnoreCase"
$patterns = @($re_double, $re_single)

# Match evaluator: returns the matched attribute with only group 1 lower-cased.
# Rewriting inside the match avoids two problems of feeding the captured URL
# back into -creplace as a pattern: regex metacharacters in the URL ('.', '?',
# '+', '(' ...) being interpreted, and identical text outside href/src
# attributes being lower-cased as well.
$lower_target = [System.Text.RegularExpressions.MatchEvaluator] {
    $m = $args[0]
    $target = $m.Groups[1]
    $offset = $target.Index - $m.Index
    $m.Value.Substring(0, $offset) + $target.Value.ToLower() + $m.Value.Substring($offset + $target.Length)
}

foreach ($file in $files)
{
    # @(...) keeps $content an array for empty and single-line files too.
    $content = @(Get-Content $file.FullName)
    $dirty = $false

    foreach ($patt in $patterns)
    {
        # Matching is done line by line; an attribute split across two lines
        # is not picked up.
        $str_matches = @($content | foreach { $patt.Matches($_) } | foreach { $_.Groups[1].Value })
        if ($str_matches.Count -gt 0)
        {
            # Same progress report as before: a count, then one line per match.
            "[+] "+$str_matches.Count+" matches found in "+$file.FullName
            $i = 0
            foreach ($str_match in $str_matches)
            {
                "[+] Match "+$i+": "+$str_match
                $i = $i+1
            }

            $content = @($content | foreach { $patt.Replace($_, $lower_target) })
            $dirty = $true
        }
    }

    # One write per file, and only when at least one attribute was found.
    if ($dirty)
    {
        Set-Content $file.FullName $content
    }
}

# 4) Replace '\' with '/' in links.
#$re_double = New-Object regex '(?:href|src)\s*=\s*"(.*?)(#.*?)?"', "Compiled,Multiline,IgnoreCase"
#$re_single = New-Object regex "(?:href|src)\s*=\s*'(.*?)(#.*?)?'", "Compiled,Multiline,IgnoreCase"
# Each pattern matches a single backslash located inside an href/src value:
#   lookbehind - the attribute name, '=', the opening quote and at least one
#                value character precede the backslash;
#   lookahead  - the closing quote follows with no space or '>' in between.
# Whitespace around '=' is accepted (\s*=\s*) so that this step covers the
# same attributes as the href/src patterns of step 3.
$re_double = New-Object regex '(?<=(?:href|src)\s*=\s*"[^"]+)\\(?=[^ >]*")', "Compiled, Multiline, IgnoreCase"
$re_single = New-Object regex "(?<=(?:href|src)\s*=\s*'[^']+)\\(?=[^ >]*')", "Compiled, Multiline, IgnoreCase"
$str_forwardslash = "/"
$patterns = @($re_double, $re_single)

foreach ($file in $files)
{
    $content = Get-Content $file.FullName

    # Get-Content yields nothing for an empty file; leave such files untouched.
    if ($null -eq $content) { continue }

    # Apply both patterns in memory, then write the file once.
    foreach ($patt in $patterns)
    {
        $content = $content -replace $patt, $str_forwardslash
    }

    sc $file.FullName $content
}

# 5) Rename all files to lower case.
# Two-step rename of every item below the current directory:
#   1. rename to "_" + lower-cased name,
#   2. strip the leading "_" again.
# NOTE(review): presumably done in two steps because a rename that changes
# only the letter case is not reliable on a case-insensitive file system.
# -PassThru hands the renamed item to the next stage (and finally to the output).
Get-ChildItem -Recurse |
    Rename-Item -NewName {"_$($_.Name.ToLower())"} -PassThru |
    Rename-Item -NewName { $_.Name.Substring(1) } -PassThru

#EOS

Blog 0x0A

Friday, February 05, 2010

Windows PowerShell (PoSh) to parse some HTML content.

No comments:

Blog Archive

Search This Blog

Followers