From 3620f8e1e86d38a572a8379658fdbf3888ecc3e0 Mon Sep 17 00:00:00 2001 From: barkat-10 Date: Fri, 2 May 2025 13:33:09 -0500 Subject: [PATCH 1/2] enricher part half complete --- crawler/enricher.ts | 191 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 crawler/enricher.ts diff --git a/crawler/enricher.ts b/crawler/enricher.ts new file mode 100644 index 0000000..90cf9f3 --- /dev/null +++ b/crawler/enricher.ts @@ -0,0 +1,191 @@ +import {Builder} from 'selenium-webdriver'; // Builder() is a Selenium class used to construct a WebDriver instance. +import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome'; //we are supposed to mention the browser we will use +import chromedriver from 'chromedriver'; +import { Client } from 'pg'; +import 'dotenv/config'; // get the dot env file + +const client = new Client({ + host: process.env.PGHOST, + port: parseInt(process.env.PGPORT || '5432', 10), + user: process.env.PGUSER, + password: process.env.PGPASSWORD, + database: process.env.PGDATABASE + }); + + async function main(){ + try{ + + await client.connect(); + console.log('Connected to PostgreSQL'); + + const fullName = 'Aashna Gajaria'; + + const crawlerRes = await client.query( + `SELECT json FROM crawler_data WHERE json->>'fullName' = $1 LIMIT 1`, + [fullName] + ); + + if (crawlerRes.rows.length === 0) { + console.log(`No crawler data found for ${fullName}`); + return; + } + + const data = crawlerRes.rows[0].json; + + // Step 2: Insert relevant fields into enricher_data + await client.query(` + INSERT INTO enricher_data ( + profile_url, + timestamp, + full_name, + email, + phone_number, + high_school, + hs_graduation_year, + naf_academy, + naf_track_certified, + city, + current_job, + university, + degree, + linkedin_link, + university_grad_year, + internship_company1, + internship_end_date1 + ) VALUES ( + $1, CURRENT_TIMESTAMP, $2, $3, $4, $5, $6, $7, $8, + $9, $10, $11, $12, $13, $14, $15, NULL + ); + `, [ + data.linkedinLink, // profile_url + data.fullName, // full_name + data.email, // email + data.phoneNumber, // phone_number + data.highSchool, // high_school + data.HSGraduationYear, // hs_graduation_year + data.NAFAcademy, // naf_academy + data.NAFTrackCertified, // naf_track_certified + data.city, // city + data.currentJob, // current_job + data.university, // university + data.degree, // degree + data.linkedinLink, // linkedin_link + data.universityGradYear, // university_grad_year + data.internshipCompany1 // internship_company1 + ]); + + console.log(`Successfully enriched data for ${fullName}`); + + } + catch (err) { + console.error('Error:', err); + } + finally{ + await client.end(); + } + } + + main(); + +const serviceBuilder = new chrome.ServiceBuilder(chromedriver.path); +const Options = chrome.Options; //The Chrome-specific configuration options you can pass to customize how Chrome runs when used with Selenium WebDriver. + +interface Person{ + fullName: String; + email?: string; + phone?: string; + school?: string; +} + +interface SearchResult { + items?: { link: string }[]; + error?: any; + } +const person: Person = { + fullName: 'Shahreen Iqbal', + email: 'singhbarkat1011@gmail.com', + phone: '', + school: 'University Of Texas, Dallas', +} + + +async function findLinkedinProfile(person: Person): Promise { + const { fullName, school } = person; + const [firstName, ...lastNameParts] = fullName.split(" "); + const lastName = lastNameParts.join(" "); + + const apiKey = process.env.API_KEY; + const searchEngineId = process.env.SEARCH_ENGINE_ID; + + if (!apiKey || !searchEngineId) { + console.error('API_KEY or SEARCH_ENGINE_ID not set in .env'); + process.exit(1); + } + + const queryVariations = [ + `site:linkedin.com/in "${fullName}"`, + `site:linkedin.com/in "${firstName} ${lastName}"`, + school ? `site:linkedin.com/in "${fullName}" "${school}"` : '', + school ? `site:linkedin.com/in "${firstName} ${lastName}" "${school}"` : '' + ].filter(Boolean); // Remove empty strings + + const chromeOptions = new Options(); + chromeOptions.addArguments( + '--headless=new', + '--disable-gpu', + '--no-sandbox', + '--disable-application-cache', + '--disable-extensions', + '--disable-notifications', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disk-cache-size=0', + '--media-cache-size=0', + '--aggressive-cache-discard' + ); + + const driver = await new Builder() + .forBrowser('chrome') + .setChromeOptions(chromeOptions) + .setChromeService(serviceBuilder) + .build(); + + try { + const allResults: any[] = []; + const seenUrls = new Set(); + + for (const query of queryVariations) { + console.log(`Searching with query: ${query}`); + const url = `https://www.googleapis.com/customsearch/v1?key=${apiKey}&cx=${searchEngineId}&q=${encodeURIComponent(query)}&num=5&${Date.now()}`; + + const data = await driver.executeAsyncScript(function (url: string, callback: Function) { + fetch(url) + .then(res => res.json()) + .then(result => callback(result)) + .catch(err => callback({ error: err.toString() })); + }, url) as SearchResult; + + const results = (data.items || []).filter((item: any) => + item.link.includes('linkedin.com/in') && !seenUrls.has(item.link) + ); + + for (const result of results) { + seenUrls.add(result.link); + allResults.push(result); + } + } + + if (allResults.length === 0) { + console.log('No LinkedIn profiles found.'); + } else { + console.log(`Top LinkedIn Matches for "${fullName}":`); + allResults.forEach((item, index) => { + console.log(`${index + 1}. ${item.link}`); + }); + } + } finally { + await driver.quit(); + } +} + +findLinkedinProfile(person).catch(console.error); \ No newline at end of file From c49951288fffc704236f73c871d4d770e616e649 Mon Sep 17 00:00:00 2001 From: barkat-10 Date: Wed, 7 May 2025 16:24:19 -0500 Subject: [PATCH 2/2] Enricher component done --- crawler/LinkedinScraper.ts | 450 +++++++++++++++++++++++++++++++++++++ crawler/enricher.ts | 209 ++++++++++------- 2 files changed, 575 insertions(+), 84 deletions(-) create mode 100644 crawler/LinkedinScraper.ts diff --git a/crawler/LinkedinScraper.ts b/crawler/LinkedinScraper.ts new file mode 100644 index 0000000..ee7a333 --- /dev/null +++ b/crawler/LinkedinScraper.ts @@ -0,0 +1,450 @@ +import { WebDriver } from 'selenium-webdriver'; +import * as fs from 'fs'; +import * as path from 'path'; +import { load } from 'cheerio'; +import type { CheerioAPI } from 'cheerio'; +import { Client } from 'pg'; + +const pagesDir = path.join(process.cwd(), 'enricher_pages'); +const jsonDir = path.join(process.cwd(), 'enricher_json'); + +export class EnricherLinkedInScraper { + private driver: WebDriver; + private maxRetries: number; + private retryDelay: number; + + constructor(driver: WebDriver, maxRetries = 5, retryDelay = 2000) { + this.driver = driver; + this.maxRetries = maxRetries; + this.retryDelay = retryDelay; + + // Ensure directories exist + if (!fs.existsSync(pagesDir)) fs.mkdirSync(pagesDir, { recursive: true }); + if (!fs.existsSync(jsonDir)) fs.mkdirSync(jsonDir, { recursive: true }); + } + + private async delay(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + private async simulateHumanBehavior(): Promise { + await this.driver.executeScript(`window.scrollBy(0, ${Math.floor(Math.random() * 300) + 100})`); + await this.delay(500 + Math.random() * 1000); + } + + public async scrapeAndStoreProfile(profileUrl: string): Promise { + try { + console.log(`Attempting to scrape: ${profileUrl}`); + + // Navigate to profile + await this.driver.get('https://www.google.com'); + await this.delay(1500); + await this.driver.executeScript(`window.location.href="${profileUrl}"`); + + // Check if page loaded properly + await this.delay(3000); + const currentUrl = await this.driver.getCurrentUrl(); + if (!currentUrl.includes('linkedin.com/in/')) { + throw new Error('Failed to load LinkedIn profile'); + } + + // Save HTML + await this.simulateHumanBehavior(); + const html = await this.driver.getPageSource(); + const profileId = profileUrl.split('/in/')[1].split('/')[0].split('?')[0]; + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const filename = `${profileId}_${timestamp}.html`; + + fs.writeFileSync(path.join(pagesDir, filename), html); + console.log(`Saved HTML: ${filename}`); + + // Convert to JSON and insert to DB + await this.convertToJson2(filename, profileUrl); + return true; + } catch (error) { + console.error(`Error scraping ${profileUrl}:`, error); + return false; + } + } + private getBestText($: CheerioAPI, selectors: string[]): string | null { + for (const selector of selectors) { + const text = $(selector).first().text().trim(); + if (text) return text; + } + return null; + } + + + + // This is just another method to try converting + + private async convertToJson2(htmlFilename: string, profileUrl: string): Promise { + const htmlPath = path.join(pagesDir, htmlFilename); + const profileId = htmlFilename.split('_')[0]; + const jsonPath = path.join(jsonDir, `${profileId}.json`); + + try { + const html = fs.readFileSync(htmlPath, 'utf-8'); + const $ = load(html); + + // 🔹 Robust selector sets + const fullName = this.getBestText($, [ + 'h1.top-card-layout__title', + 'h1.text-heading-xlarge', + 'h1', + ]); + + const jobTitle = this.getBestText($, [ + 'h2.top-card-layout__headline', + 'h2.text-body-medium', + 'h2', + ]); + + const locationText = this.getBestText($, [ + '.top-card-layout__first-subline span:first-child', + '.top-card__subline-item', + '.top-card-location', + ]); + + let city = null, state = null; + if (locationText) { + const parts = locationText.split(',').map(p => p.trim()); + [city, state] = [parts[0], parts[1]]; + } + + // 🔹 Education Section + let highSchool = null, hsGraduationYear = null; + let university: string | null = null; +let universityGradYear: string | null = null; +let degree: string | null = null; + let nafAcademy = null, nafTrackCertified = null; + + const educationSection = $('section[data-section="educationsDetails"], section:contains("Education")'); + if (educationSection.length === 0) { + console.warn(`No education section for ${htmlFilename}`); + } + + educationSection.find('li.education__list-item, li').each((_, el) => { + const textBlock = $(el).text().toLowerCase(); + const school = $(el).find('h3').first().text().trim(); + const degreeText = $(el).find('h4').first().text().trim(); + const duration = $(el).find('span.date-range').first().text().trim(); + const yearMatch = duration.match(/(\d{4})/); + const gradYear = yearMatch?.[1] ?? null; + + if (textBlock.includes('high school')) { + highSchool = school; + hsGraduationYear = gradYear; + } else { + university = university || school; + universityGradYear = universityGradYear || gradYear; + degree = degree || degreeText; + } + + if (school.toLowerCase().includes('academy of finance')) { + nafAcademy = school; + } + if (degreeText.toLowerCase().includes('naf track')) { + nafTrackCertified = degreeText; + } + }); + + // 🔹 Experience Section + let currentJob = null; + let internship_company1: string | null = null; + let internship_end_date1 = null; + let internship_company2: string | null = null; + let internship_end_date2 = null; + + const expSection = $('section[data-section="experience"], section:contains("Experience")'); + if (expSection.length === 0) { + console.warn(`No experience section for ${htmlFilename}`); + } + + expSection.find('ul > li').each((i, el) => { + const title = $(el).find('h3 span, h3').first().text().trim(); + const company = $(el).find('h4 span, h4').first().text().trim(); + const duration = $(el).find('span.date-range').first().text().trim(); + const endDateMatch = duration.match(/(\w+\s+\d{4}|\d{4})$/); + const endDate = endDateMatch?.[0] ?? null; + + if (i === 0 && title && company) { + currentJob = `${title} at ${company}`; + } + + if (/intern(ship)?/i.test(title)) { + if (!internship_company1) { + internship_company1 = company; + internship_end_date1 = endDate; + } else if (!internship_company2) { + internship_company2 = company; + internship_end_date2 = endDate; + } + } + }); + + // 🔹 Build JSON + const dbData = { + profile_url: profileUrl, + timestamp: new Date().toISOString(), + full_name: fullName, + email: null, + phone_number: null, + high_school: highSchool, + hs_graduation_year: hsGraduationYear, + naf_academy: nafAcademy, + naf_track_certified: nafTrackCertified, + address: null, + city, + state, + zip_code: null, + birthdate: null, + gender: null, + ethnicity: null, + military_branch_served: null, + current_job: currentJob || jobTitle, + college_major: null, + university_grad_year: universityGradYear, + university, + degree, + linkedin_link: profileUrl, + school_district: null, + internship_company1, + internship_end_date1, + internship_company2, + internship_end_date2, + university2: null, + college_major2: null, + degree2: null + }; + + fs.writeFileSync(jsonPath, JSON.stringify(dbData, null, 2)); + console.log(`Saved JSON: ${jsonPath}`); + + await this.insertToEnricherDatabase(dbData); + } catch (error) { + console.error(`Error converting ${htmlFilename}:`, error); + throw error; + } + } + + + private async convertToJson(htmlFilename: string, profileUrl: string): Promise { + const htmlPath = path.join(pagesDir, htmlFilename); + const profileId = htmlFilename.split('_')[0]; + const jsonPath = path.join(jsonDir, `${profileId}.json`); + + try { + const html = fs.readFileSync(htmlPath, 'utf-8'); + const $ = load(html); + + // Extract basic profile information - UPDATED SELECTORS + const fullName = this.getText($, 'h1.top-card-layout__title') || + $('h1').text().trim(); + + const jobTitle = this.getText($, 'h2.top-card-layout__headline') || + $('h2').text().trim(); + + const locationText = this.getText($, '.top-card-layout__first-subline > span:first-child') || + $('.top-card-location').text().trim(); + + // Parse location into city/state + let city = null; + let state = null; + if (locationText) { + const locationParts = locationText.split(', '); + city = locationParts[0] || null; + state = locationParts[1] || null; + } + + // Extract education information - UPDATED TO USE SECTIONS + let highSchool = null; + let hsGraduationYear = null; + let university = null; + let universityGradYear = null; + let degree = null; + let nafAcademy = null; + let nafTrackCertified = null; + + const educationSection = $('section[data-section="educationsDetails"]'); + if (educationSection.length > 0) { + educationSection.find('ul > li.education__list-item').each((_, el) => { + const school = $(el).find('h3').first().text().trim(); + const degreeText = $(el).find('h4').first().text().trim(); + const duration = $(el).find('span.date-range').first().text().trim(); + + const yearMatch = duration.match(/(\d{4})/); + const gradYear = yearMatch ? yearMatch[1] : null; + + if (school.toLowerCase().includes('high school')) { + highSchool = school; + hsGraduationYear = gradYear; + } else { + university = school; + universityGradYear = gradYear; + degree = degreeText; + } + + if (school.toLowerCase().includes('academy of finance')) { + nafAcademy = school; + } + if (degreeText.toLowerCase().includes('naf track')) { + nafTrackCertified = degreeText; + } + }); + } + + // Extract experience information - UPDATED TO USE SECTIONS + let currentJob = null; + let internshipCompany1: string | null = null; + let internship_end_date1 = null; + let internship_company2: string | null = null; + let internship_end_date2 = null; + + const expSection = $('section[data-section="experience"]'); + if (expSection.length > 0) { + expSection.find('ul > li').each((i, el) => { + const title = $(el).find('h3 span.experience-item__title').first().text().trim(); + const company = $(el).find('h4 span.experience-item__subtitle').first().text().trim(); + const duration = $(el).find('span.date-range').first().text().trim(); + + const endDateMatch = duration.match(/(\w+\s+\d{4}|\d{4})$/); + const endDate = endDateMatch ? endDateMatch[0] : null; + + if (i === 0) { + currentJob = title ? `${title} at ${company}` : company; + } + + if (title?.toLowerCase().includes('intern') || + title?.toLowerCase().includes('internship')) { + if (!internshipCompany1) { + internshipCompany1 = company; + internship_end_date1 = endDate; + } else if (!internship_company2) { + internship_company2 = company; + internship_end_date2 = endDate; + } + } + }); + } + + // Prepare data for database insertion + const dbData = { + profile_url: profileUrl, + timestamp: new Date().toISOString(), + full_name: fullName, + email: null, + phone_number: null, + high_school: highSchool, + hs_graduation_year: hsGraduationYear, + naf_academy: nafAcademy, + naf_track_certified: nafTrackCertified, + address: null, + city: city, + state: state, + zip_code: null, + birthdate: null, + gender: null, + ethnicity: null, + military_branch_served: null, + current_job: currentJob || jobTitle, + college_major: null, + university_grad_year: universityGradYear, + university: university, + degree: degree, + linkedin_link: profileUrl, + school_district: null, + internship_company1: internshipCompany1, + internship_end_date1: internship_end_date1, + internship_company2: internship_company2, + internship_end_date2: internship_end_date2, + university2: null, + college_major2: null, + degree2: null + }; + + // Save JSON + fs.writeFileSync(jsonPath, JSON.stringify(dbData, null, 2)); + console.log(`Saved JSON: ${jsonPath}`); + + // Insert to database + await this.insertToEnricherDatabase(dbData); + } catch (error) { + console.error(`Error converting ${htmlFilename}:`, error); + throw error; + } + } + +// Add this helper function at class level +private getText($: CheerioAPI, selector: string): string | null { + const element = $(selector).first(); + return element.length ? element.text().trim() : null; +} + private async insertToEnricherDatabase(data: any): Promise { + const client = new Client({ + host: process.env.PGHOST, + port: parseInt(process.env.PGPORT || '5432'), + user: process.env.PGUSER, + password: process.env.PGPASSWORD, + database: process.env.PGDATABASE + }); + + try { + await client.connect(); + await client.query(` + INSERT INTO enricher_data ( + profile_url, timestamp, full_name, email, phone_number, + high_school, hs_graduation_year, naf_academy, naf_track_certified, + address, city, state, zip_code, birthdate, gender, ethnicity, + military_branch_served, current_job, college_major, university_grad_year, + university, degree, linkedin_link, school_district, + internship_company1, internship_end_date1, + internship_company2, internship_end_date2, + university2, college_major2, degree2 + ) VALUES ( + $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, + $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, + $29, $30, $31 + ) + `, [ + data.profile_url, + data.timestamp, + data.full_name, + data.email, + data.phone_number, + data.high_school, + data.hs_graduation_year, + data.naf_academy, + data.naf_track_certified, + data.address, + data.city, + data.state, + data.zip_code, + data.birthdate, + data.gender, + data.ethnicity, + data.military_branch_served, + data.current_job, + data.college_major, + data.university_grad_year, + data.university, + data.degree, + data.linkedin_link, + data.school_district, + data.internship_company1, + data.internship_end_date1, + data.internship_company2, + data.internship_end_date2, + data.university2, + data.college_major2, + data.degree2 + ]); + console.log('Data inserted into enricher_data'); + } catch (error) { + console.error('Error inserting into enricher_data:', error); + throw error; + } finally { + await client.end(); + } + } +} \ No newline at end of file diff --git a/crawler/enricher.ts b/crawler/enricher.ts index 90cf9f3..62747c7 100644 --- a/crawler/enricher.ts +++ b/crawler/enricher.ts @@ -3,6 +3,10 @@ import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome'; //we are sup import chromedriver from 'chromedriver'; import { Client } from 'pg'; import 'dotenv/config'; // get the dot env file +import { EnricherLinkedInScraper } from './LinkedinScraper'; +import { ProfileData } from './html_json'; +import * as path from 'path'; +import * as fs from 'fs'; const client = new Client({ host: process.env.PGHOST, @@ -11,8 +15,8 @@ const client = new Client({ password: process.env.PGPASSWORD, database: process.env.PGDATABASE }); - - async function main(){ +// this is the function to try finding the person in cralwer database. + async function findInDb(){ try{ await client.connect(); @@ -74,7 +78,7 @@ const client = new Client({ data.internshipCompany1 // internship_company1 ]); - console.log(`Successfully enriched data for ${fullName}`); + console.log(`Successfully enriched data for ${fullName}`); } catch (err) { @@ -85,107 +89,144 @@ const client = new Client({ } } - main(); + findInDb(); + + -const serviceBuilder = new chrome.ServiceBuilder(chromedriver.path); -const Options = chrome.Options; //The Chrome-specific configuration options you can pass to customize how Chrome runs when used with Selenium WebDriver. -interface Person{ - fullName: String; + const serviceBuilder = new chrome.ServiceBuilder(chromedriver.path); + const Options = chrome.Options; //The Chrome-specific configuration options you can pass to customize how Chrome runs when used with Selenium WebDriver. + + interface Person { + fullName: string; email?: string; phone?: string; school?: string; -} + currentJob?: string; + city?: string; + highSchool?: string; + degree?: string; + } -interface SearchResult { + interface SearchResult { items?: { link: string }[]; error?: any; } -const person: Person = { - fullName: 'Shahreen Iqbal', - email: 'singhbarkat1011@gmail.com', - phone: '', - school: 'University Of Texas, Dallas', -} + /* */ -async function findLinkedinProfile(person: Person): Promise { - const { fullName, school } = person; - const [firstName, ...lastNameParts] = fullName.split(" "); - const lastName = lastNameParts.join(" "); + + async function findLinkedinProfile(person: Person): Promise { + const { fullName, school, currentJob, city, highSchool, degree } = person; + const apiKey = process.env.API_KEY; const searchEngineId = process.env.SEARCH_ENGINE_ID; - + if (!apiKey || !searchEngineId) { - console.error('API_KEY or SEARCH_ENGINE_ID not set in .env'); - process.exit(1); + console.error('API_KEY or SEARCH_ENGINE_ID not set in .env'); + process.exit(1); } + + let queryParts = [`site:linkedin.com/in "${fullName}"`]; + if (school) queryParts.push(school); + if (currentJob) queryParts.push(currentJob); + if (city) queryParts.push(city); + if (highSchool) queryParts.push(highSchool); + if (degree) queryParts.push(degree); - const queryVariations = [ - `site:linkedin.com/in "${fullName}"`, - `site:linkedin.com/in "${firstName} ${lastName}"`, - school ? `site:linkedin.com/in "${fullName}" "${school}"` : '', - school ? `site:linkedin.com/in "${firstName} ${lastName}" "${school}"` : '' - ].filter(Boolean); // Remove empty strings +const query = queryParts.join(" "); + + const url = `https://www.googleapis.com/customsearch/v1?key=${apiKey}&cx=${searchEngineId}&q=${encodeURIComponent(query)}&num=5`; + const chromeOptions = new Options(); - chromeOptions.addArguments( - '--headless=new', - '--disable-gpu', - '--no-sandbox', - '--disable-application-cache', - '--disable-extensions', - '--disable-notifications', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disk-cache-size=0', - '--media-cache-size=0', - '--aggressive-cache-discard' - ); - + chromeOptions.addArguments('--headless=new', '--disable-gpu', '--no-sandbox'); + const driver = await new Builder() - .forBrowser('chrome') - .setChromeOptions(chromeOptions) - .setChromeService(serviceBuilder) - .build(); - + .forBrowser('chrome') + .setChromeOptions(chromeOptions) + .setChromeService(serviceBuilder) + .build(); + try { - const allResults: any[] = []; - const seenUrls = new Set(); - - for (const query of queryVariations) { - console.log(`Searching with query: ${query}`); - const url = `https://www.googleapis.com/customsearch/v1?key=${apiKey}&cx=${searchEngineId}&q=${encodeURIComponent(query)}&num=5&${Date.now()}`; - - const data = await driver.executeAsyncScript(function (url: string, callback: Function) { - fetch(url) - .then(res => res.json()) - .then(result => callback(result)) - .catch(err => callback({ error: err.toString() })); - }, url) as SearchResult; - - const results = (data.items || []).filter((item: any) => - item.link.includes('linkedin.com/in') && !seenUrls.has(item.link) - ); - - for (const result of results) { - seenUrls.add(result.link); - allResults.push(result); - } - } - - if (allResults.length === 0) { - console.log('No LinkedIn profiles found.'); - } else { - console.log(`Top LinkedIn Matches for "${fullName}":`); - allResults.forEach((item, index) => { - console.log(`${index + 1}. ${item.link}`); - }); - } + const data = await driver.executeAsyncScript(function (url: string, callback: Function) { + fetch(url) + .then(res => res.json()) + .then(result => callback(result)) + .catch(err => callback({ error: err.toString() })); + }, url) as SearchResult; + + const results = (data.items || []).filter(item => + item.link.includes('linkedin.com/in') + ); + + if (results.length === 0) { + console.log(`No LinkedIn profiles found for "${fullName}".`); + return null; + } else { + console.log(`Top LinkedIn Matches for "${fullName}":`); + results.forEach((item, index) => { + console.log(`${index + 1}. ${item.link}`); + }); + const firstProfileUrl = results[0].link; + + return firstProfileUrl; + } } finally { - await driver.quit(); + await driver.quit(); } -} - -findLinkedinProfile(person).catch(console.error); \ No newline at end of file + + } + + async function findOnLinkedin() { + try { + + + // 👇 change this to the person you want to test + const fullName = 'Luke Edwards'; + + const res = await client.query( + `SELECT json FROM crawler_data WHERE json->>'fullName' = $1 LIMIT 1`, + [fullName] + ); + + if (res.rows.length === 0) { + console.log(`No data found for ${fullName}`); + return; + } + + const data = res.rows[0].json; + + const person: Person = { + fullName: data.fullName, + email: data.email, + phone: data.phoneNumber, + school: data.university || data.highSchool, + currentJob: data.currentJob, + city: data.city, + highSchool: data.highSchool, + degree: data.degree, + }; + + const profileUrl = await findLinkedinProfile(person); + if (!profileUrl) return; + // Initialize scraper and scrape the profile + const chromeOptions = new Options(); + chromeOptions.addArguments('--headless=new', '--disable-gpu', '--no-sandbox'); + const driver = await new Builder() + .forBrowser('chrome') + .setChromeOptions(chromeOptions) + .setChromeService(serviceBuilder) + .build(); + + const scraper = new EnricherLinkedInScraper(driver); + await scraper.scrapeAndStoreProfile(profileUrl); + } catch (err) { + console.error('Error:', err); + } finally { + await client.end(); + } + } + + findOnLinkedin().catch(console.error); \ No newline at end of file