Open SiteSearch 4.1.1
Final

ORG.oclc.resources.html
Class HTMLResource

java.lang.Object
  |
  +--ORG.oclc.resources.html.HTMLResource

public class HTMLResource
extends Object
implements NetworkResource


Field Summary
protected  boolean allContentChecksAreDone
           
protected  String cleanContent
           
protected  boolean cleanContentIsSet
           
protected  String closeTag
           
protected  boolean containsCode
           
protected  boolean containsCodeIsSet
           
protected  boolean containsForm
           
protected  boolean containsFormIsSet
           
protected  boolean containsFrames
           
protected  boolean containsFramesIsSet
           
protected  boolean containsImageMap
           
protected  boolean containsImageMapIsSet
           
protected  boolean containsLayers
           
protected  boolean containsLayersIsSet
           
protected  boolean containsMetaTags
           
protected  boolean containsMetaTagsIsSet
           
protected  boolean containsNewsCues
           
protected  boolean containsNewsCuesIsSet
           
protected  boolean containsStyles
           
protected  boolean containsStylesIsSet
           
protected  boolean containsTechnicalCues
           
protected  boolean containsTechnicalCuesIsSet
           
protected  boolean containsTemporalCues
           
protected  boolean containsTemporalCuesIsSet
           
protected  boolean containsXML
           
protected  boolean containsXMLIsSet
           
protected  String content
           
protected  String copyrightFromCRText
           
protected  boolean copyRightIsSet
           
protected  String dateFromCRText
           
protected  Vector descriptions
           
protected  boolean descriptionsIsSet
           
protected  String endOfOpenTag
           
protected  String excerpt
           
protected  boolean excerptIsSet
           
protected  Vector languages
           
protected  boolean languagesIsSet
           
protected  int linkRatio
           
protected  boolean linkRatioIsSet
           
protected  Vector links
           
protected  boolean looksLike404
           
protected  boolean looksLike404IsSet
           
protected  String lowerCaseContent
           
protected  boolean lowerCaseContentIsSet
           
protected  Vector metaTags
           
protected  int noTagsLength
           
protected  int noTagsNoAnchorsLength
           
protected  Vector oneoffs
           
protected  String openTag
           
protected  boolean processedTagsIsSet
           
protected  String pubFromCRText
           
protected  String rdf
           
protected  String startOfOpenTag
           
protected  String title
           
protected  boolean titleIsSet
           
protected  Vector xmlTags
           
 
Constructor Summary
HTMLResource(String doc)
           
 
Method Summary
 String cleanChunk(String chunk, int len)
           
 boolean containsCode()
           
 boolean containsForm()
           
 boolean containsFrames()
           
 boolean containsImageMap()
           
 boolean containsLayers()
           
 boolean containsMetaTags()
           
 boolean containsNewsCues()
           
 boolean containsPattern(String uncompiledPattern, int limit)
           
 boolean containsStyles()
           
 boolean containsTechnicalCues()
           
 boolean containsTemporalCues()
           
 boolean containsXML()
           
 void doAllContentChecks()
           
 String getCleanContent()
           
 String getContent()
           
 String getCopyrightFromText()
           
 Vector getDescriptions()
          Returns the content attributes of the meta tags whose name or http-equiv attribute contains "description".
 String getExcerpt()
           
 String getExcerpt(boolean refresh)
           
 String getExcerpt(int limit)
           
 String getExcerpt(int limit, boolean refresh)
           
 Vector getLanguages()
          Returns the content attributes of the meta tags whose name or http-equiv attribute contains "language".
 int getLinkRatio()
           
 Vector getLinks()
           
 String getLowerCaseContent()
           
 Vector getMetaTags()
           
 Vector getOneoffs()
           
 String getPublisherFromText()
           
 String getRDF()
           
 String getTitle()
           
 Vector getXMLTags()
           
 boolean looksLike404()
           
static void main(String[] args)
           
 void processTags()
           
protected  void setLowerCaseContent()
           
 String toString()
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
 

Field Detail

title

protected String title

titleIsSet

protected boolean titleIsSet

copyRightIsSet

protected boolean copyRightIsSet

pubFromCRText

protected String pubFromCRText

dateFromCRText

protected String dateFromCRText

copyrightFromCRText

protected String copyrightFromCRText

excerpt

protected String excerpt

excerptIsSet

protected boolean excerptIsSet

content

protected String content

cleanContent

protected String cleanContent

cleanContentIsSet

protected boolean cleanContentIsSet

lowerCaseContent

protected String lowerCaseContent

lowerCaseContentIsSet

protected boolean lowerCaseContentIsSet

noTagsLength

protected int noTagsLength

noTagsNoAnchorsLength

protected int noTagsNoAnchorsLength

linkRatio

protected int linkRatio

linkRatioIsSet

protected boolean linkRatioIsSet

allContentChecksAreDone

protected boolean allContentChecksAreDone

containsMetaTagsIsSet

protected boolean containsMetaTagsIsSet

containsMetaTags

protected boolean containsMetaTags

containsXMLIsSet

protected boolean containsXMLIsSet

containsXML

protected boolean containsXML

containsFormIsSet

protected boolean containsFormIsSet

containsForm

protected boolean containsForm

containsFramesIsSet

protected boolean containsFramesIsSet

containsFrames

protected boolean containsFrames

containsImageMapIsSet

protected boolean containsImageMapIsSet

containsImageMap

protected boolean containsImageMap

containsLayersIsSet

protected boolean containsLayersIsSet

containsLayers

protected boolean containsLayers

containsStylesIsSet

protected boolean containsStylesIsSet

containsStyles

protected boolean containsStyles

containsTemporalCuesIsSet

protected boolean containsTemporalCuesIsSet

containsTemporalCues

protected boolean containsTemporalCues

containsNewsCuesIsSet

protected boolean containsNewsCuesIsSet

containsNewsCues

protected boolean containsNewsCues

containsTechnicalCuesIsSet

protected boolean containsTechnicalCuesIsSet

containsTechnicalCues

protected boolean containsTechnicalCues

containsCodeIsSet

protected boolean containsCodeIsSet

containsCode

protected boolean containsCode

looksLike404IsSet

protected boolean looksLike404IsSet

looksLike404

protected boolean looksLike404

processedTagsIsSet

protected boolean processedTagsIsSet

rdf

protected String rdf

oneoffs

protected Vector oneoffs

links

protected Vector links

metaTags

protected Vector metaTags

xmlTags

protected Vector xmlTags

descriptions

protected Vector descriptions

descriptionsIsSet

protected boolean descriptionsIsSet

languages

protected Vector languages

languagesIsSet

protected boolean languagesIsSet

startOfOpenTag

protected String startOfOpenTag

endOfOpenTag

protected String endOfOpenTag

openTag

protected String openTag

closeTag

protected String closeTag
Constructor Detail

HTMLResource

public HTMLResource(String doc)
Method Detail

getTitle

public String getTitle()
Specified by:
getTitle in interface NetworkResource

getPublisherFromText

public String getPublisherFromText()
Specified by:
getPublisherFromText in interface NetworkResource

getCopyrightFromText

public String getCopyrightFromText()
Specified by:
getCopyrightFromText in interface NetworkResource

getExcerpt

public String getExcerpt(boolean refresh)

getExcerpt

public String getExcerpt()
Specified by:
getExcerpt in interface NetworkResource

getExcerpt

public String getExcerpt(int limit,
                         boolean refresh)

getExcerpt

public String getExcerpt(int limit)
Specified by:
getExcerpt in interface NetworkResource

getContent

public String getContent()
Specified by:
getContent in interface NetworkResource

setLowerCaseContent

protected void setLowerCaseContent()

getLowerCaseContent

public String getLowerCaseContent()
Specified by:
getLowerCaseContent in interface NetworkResource

cleanChunk

public String cleanChunk(String chunk,
                         int len)

getCleanContent

public String getCleanContent()
Specified by:
getCleanContent in interface NetworkResource

getLinkRatio

public int getLinkRatio()
Specified by:
getLinkRatio in interface NetworkResource

toString

public String toString()
Specified by:
toString in interface NetworkResource
Overrides:
toString in class Object

containsCode

public boolean containsCode()

containsTechnicalCues

public boolean containsTechnicalCues()

containsTemporalCues

public boolean containsTemporalCues()

containsNewsCues

public boolean containsNewsCues()

containsMetaTags

public boolean containsMetaTags()

containsXML

public boolean containsXML()

containsForm

public boolean containsForm()

containsFrames

public boolean containsFrames()

containsImageMap

public boolean containsImageMap()

containsLayers

public boolean containsLayers()

containsStyles

public boolean containsStyles()

looksLike404

public boolean looksLike404()

containsPattern

public boolean containsPattern(String uncompiledPattern,
                               int limit)

doAllContentChecks

public void doAllContentChecks()

processTags

public void processTags()

getMetaTags

public Vector getMetaTags()

getXMLTags

public Vector getXMLTags()

getOneoffs

public Vector getOneoffs()

getLinks

public Vector getLinks()

getRDF

public String getRDF()

main

public static void main(String[] args)

getDescriptions

public Vector getDescriptions()
Returns the content attributes of the meta tags whose name or http-equiv attribute contains "description".

getLanguages

public Vector getLanguages()
Returns the content attributes of the meta tags whose name or http-equiv attribute contains "language".

Open SiteSearch 4.1.1
Final